VirtualBox

source: vbox/trunk/src/libs/libxml2-2.13.2/HTMLparser.c@ 107351

Last change on this file since 107351 was 105420, checked in by vboxsync, 6 months ago

libxml2-2.12.6: Applied and adjusted our libxml2 changes to 2.12.6. bugref:10730

  • Property svn:eol-style set to native
File size: 192.3 KB
Line 
1/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * daniel@veillard.com
7 */
8
9#define IN_LIBXML
10#include "libxml.h"
11#ifdef LIBXML_HTML_ENABLED
12
13#include <string.h>
14#include <ctype.h>
15#include <stdlib.h>
16
17#include <libxml/HTMLparser.h>
18#include <libxml/xmlmemory.h>
19#include <libxml/tree.h>
20#include <libxml/parser.h>
21#include <libxml/parserInternals.h>
22#include <libxml/xmlerror.h>
23#include <libxml/HTMLtree.h>
24#include <libxml/entities.h>
25#include <libxml/encoding.h>
26#include <libxml/xmlIO.h>
27#include <libxml/uri.h>
28
29#include "private/buf.h"
30#include "private/enc.h"
31#include "private/error.h"
32#include "private/html.h"
33#include "private/io.h"
34#include "private/parser.h"
35#include "private/tree.h"
36
37#define HTML_MAX_NAMELEN 1000
38#define HTML_PARSER_BIG_BUFFER_SIZE 1000
39#define HTML_PARSER_BUFFER_SIZE 100
40
41static int htmlOmittedDefaultValue = 1;
42
43xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
44 xmlChar end, xmlChar end2, xmlChar end3);
45static void htmlParseComment(htmlParserCtxtPtr ctxt);
46
47/************************************************************************
48 * *
49 * Some factorized error routines *
50 * *
51 ************************************************************************/
52
53/**
54 * htmlErrMemory:
55 * @ctxt: an HTML parser context
56 * @extra: extra information
57 *
58 * Handle a redefinition of attribute error
59 */
60static void
61htmlErrMemory(xmlParserCtxtPtr ctxt)
62{
63 xmlCtxtErrMemory(ctxt);
64}
65
66/**
67 * htmlParseErr:
68 * @ctxt: an HTML parser context
69 * @error: the error number
70 * @msg: the error message
71 * @str1: string infor
72 * @str2: string infor
73 *
74 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
75 */
76static void LIBXML_ATTR_FORMAT(3,0)
77htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
78 const char *msg, const xmlChar *str1, const xmlChar *str2)
79{
80 xmlCtxtErr(ctxt, NULL, XML_FROM_HTML, error, XML_ERR_ERROR,
81 str1, str2, NULL, 0, msg, str1, str2);
82}
83
84/**
85 * htmlParseErrInt:
86 * @ctxt: an HTML parser context
87 * @error: the error number
88 * @msg: the error message
89 * @val: integer info
90 *
91 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
92 */
93static void LIBXML_ATTR_FORMAT(3,0)
94htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
95 const char *msg, int val)
96{
97 xmlCtxtErr(ctxt, NULL, XML_FROM_HTML, error, XML_ERR_ERROR,
98 NULL, NULL, NULL, val, msg, val);
99}
100
101/************************************************************************
102 * *
103 * Parser stacks related functions and macros *
104 * *
105 ************************************************************************/
106
107/**
108 * htmlnamePush:
109 * @ctxt: an HTML parser context
110 * @value: the element name
111 *
112 * Pushes a new element name on top of the name stack
113 *
114 * Returns -1 in case of error, the index in the stack otherwise
115 */
116static int
117htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
118{
119 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
120 ctxt->html = 3;
121 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
122 ctxt->html = 10;
123 if (ctxt->nameNr >= ctxt->nameMax) {
124 size_t newSize = ctxt->nameMax * 2;
125 const xmlChar **tmp;
126
127 tmp = xmlRealloc((xmlChar **) ctxt->nameTab,
128 newSize * sizeof(ctxt->nameTab[0]));
129 if (tmp == NULL) {
130 htmlErrMemory(ctxt);
131 return (-1);
132 }
133 ctxt->nameTab = tmp;
134 ctxt->nameMax = newSize;
135 }
136 ctxt->nameTab[ctxt->nameNr] = value;
137 ctxt->name = value;
138 return (ctxt->nameNr++);
139}
140/**
141 * htmlnamePop:
142 * @ctxt: an HTML parser context
143 *
144 * Pops the top element name from the name stack
145 *
146 * Returns the name just removed
147 */
148static const xmlChar *
149htmlnamePop(htmlParserCtxtPtr ctxt)
150{
151 const xmlChar *ret;
152
153 if (ctxt->nameNr <= 0)
154 return (NULL);
155 ctxt->nameNr--;
156 if (ctxt->nameNr < 0)
157 return (NULL);
158 if (ctxt->nameNr > 0)
159 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
160 else
161 ctxt->name = NULL;
162 ret = ctxt->nameTab[ctxt->nameNr];
163 ctxt->nameTab[ctxt->nameNr] = NULL;
164 return (ret);
165}
166
167/**
168 * htmlNodeInfoPush:
169 * @ctxt: an HTML parser context
170 * @value: the node info
171 *
172 * Pushes a new element name on top of the node info stack
173 *
174 * Returns 0 in case of error, the index in the stack otherwise
175 */
176static int
177htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
178{
179 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
180 if (ctxt->nodeInfoMax == 0)
181 ctxt->nodeInfoMax = 5;
182 ctxt->nodeInfoMax *= 2;
183 ctxt->nodeInfoTab = (htmlParserNodeInfo *)
184 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
185 ctxt->nodeInfoMax *
186 sizeof(ctxt->nodeInfoTab[0]));
187 if (ctxt->nodeInfoTab == NULL) {
188 htmlErrMemory(ctxt);
189 return (0);
190 }
191 }
192 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
193 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
194 return (ctxt->nodeInfoNr++);
195}
196
197/**
198 * htmlNodeInfoPop:
199 * @ctxt: an HTML parser context
200 *
201 * Pops the top element name from the node info stack
202 *
203 * Returns 0 in case of error, the pointer to NodeInfo otherwise
204 */
205static htmlParserNodeInfo *
206htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
207{
208 if (ctxt->nodeInfoNr <= 0)
209 return (NULL);
210 ctxt->nodeInfoNr--;
211 if (ctxt->nodeInfoNr < 0)
212 return (NULL);
213 if (ctxt->nodeInfoNr > 0)
214 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
215 else
216 ctxt->nodeInfo = NULL;
217 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
218}
219
/*
 * Macros for accessing the input. They are private to the parser and
 * must not be exported. All of them expect a variable named `ctxt`
 * (the parser context) to be in scope at the point of use.
 *
 * "Dirty" macros, which work on raw bytes and should only be used to
 * compare against or skip over ASCII content:
 *
 *   CUR_PTR    the current input pointer
 *   BASE_PTR   the base pointer of the current input
 *   CUR, RAW   the byte at the current position
 *   UPPER      the current byte passed through toupper()
 *   NXT(n)     the byte n positions after the current one
 *   UPP(n)     NXT(n) passed through toupper()
 *   SKIP(n)    advance the input pointer and the column by n; does no
 *              line accounting, so it must not be used across newlines
 *
 * Character-level macros:
 *
 *   NEXT         advance by one character via xmlNextChar()
 *   NEXTL(l)     advance by l bytes, updating line/col for the
 *                current byte
 *   CUR_CHAR(l)  current character via htmlCurrentChar(); its byte
 *                length is stored in l
 *   COPY_BUF(l,b,i,v)
 *                append character v of encoded length l to buffer b
 *                at index i, advancing i
 *   SKIP_BLANKS  skip blank characters via htmlSkipBlankChars()
 *
 * Buffer management (both are no-ops when PARSER_PROGRESSIVE(ctxt)):
 *
 *   SHRINK     call xmlParserShrink() when more than 2 * INPUT_CHUNK
 *              bytes are consumed and fewer than 2 * INPUT_CHUNK remain
 *   GROW       call xmlParserGrow() when fewer than INPUT_CHUNK bytes
 *              remain
 *
 * NOTE: SHRINK and GROW expand to a bare `if` statement including its
 * trailing semicolon, and COPY_BUF expands to a bare `if`/`else`;
 * take care when using them inside unbraced conditionals.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) ctxt->input->cur += (val),ctxt->input->col+=(val)

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

#define SHRINK \
    if ((!PARSER_PROGRESSIVE(ctxt)) && \
        (ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
        xmlParserShrink(ctxt);

#define GROW \
    if ((!PARSER_PROGRESSIVE(ctxt)) && \
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
        xmlParserGrow(ctxt);

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR (*ctxt->input->cur)
#define NEXT xmlNextChar(ctxt)

#define RAW (*ctxt->input->cur)


#define NEXTL(l) do { \
    if (*(ctxt->input->cur) == '\n') { \
        ctxt->input->line++; ctxt->input->col = 1; \
    } else ctxt->input->col++; \
    ctxt->input->cur += l; \
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)

#define COPY_BUF(l,b,i,v) \
    if (l == 1) b[i++] = v; \
    else i += xmlCopyChar(l,&b[i],v)
298/**
299 * htmlFindEncoding:
300 * @the HTML parser context
301 *
302 * Ty to find and encoding in the current data available in the input
303 * buffer this is needed to try to switch to the proper encoding when
304 * one face a character error.
305 * That's an heuristic, since it's operating outside of parsing it could
306 * try to use a meta which had been commented out, that's the reason it
307 * should only be used in case of error, not as a default.
308 *
309 * Returns an encoding string or NULL if not found, the string need to
310 * be freed
311 */
312static xmlChar *
313htmlFindEncoding(xmlParserCtxtPtr ctxt) {
314 const xmlChar *start, *cur, *end;
315 xmlChar *ret;
316
317 if ((ctxt == NULL) || (ctxt->input == NULL) ||
318 (ctxt->input->flags & XML_INPUT_HAS_ENCODING))
319 return(NULL);
320 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
321 return(NULL);
322
323 start = ctxt->input->cur;
324 end = ctxt->input->end;
325 /* we also expect the input buffer to be zero terminated */
326 if (*end != 0)
327 return(NULL);
328
329 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
330 if (cur == NULL)
331 return(NULL);
332 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
333 if (cur == NULL)
334 return(NULL);
335 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
336 if (cur == NULL)
337 return(NULL);
338 cur += 8;
339 start = cur;
340 while (((*cur >= 'A') && (*cur <= 'Z')) ||
341 ((*cur >= 'a') && (*cur <= 'z')) ||
342 ((*cur >= '0') && (*cur <= '9')) ||
343 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
344 cur++;
345 if (cur == start)
346 return(NULL);
347 ret = xmlStrndup(start, cur - start);
348 if (ret == NULL)
349 htmlErrMemory(ctxt);
350 return(ret);
351}
352
353/**
354 * htmlCurrentChar:
355 * @ctxt: the HTML parser context
356 * @len: pointer to the length of the char read
357 *
358 * The current char value, if using UTF-8 this may actually span multiple
359 * bytes in the input buffer. Implement the end of line normalization:
360 * 2.11 End-of-Line Handling
361 * If the encoding is unspecified, in the case we find an ISO-Latin-1
362 * char, then the encoding converter is plugged in automatically.
363 *
364 * Returns the current char value and its length
365 */
366
367static int
368htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
369 const unsigned char *cur;
370 unsigned char c;
371 unsigned int val;
372
373 if (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)
374 xmlParserGrow(ctxt);
375
376 if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) {
377 xmlChar * guess;
378
379 /*
380 * Assume it's a fixed length encoding (1) with
381 * a compatible encoding for the ASCII set, since
382 * HTML constructs only use < 128 chars
383 */
384 if (*ctxt->input->cur < 0x80) {
385 if (*ctxt->input->cur == 0) {
386 if (ctxt->input->cur < ctxt->input->end) {
387 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
388 "Char 0x%X out of allowed range\n", 0);
389 *len = 1;
390 return(' ');
391 } else {
392 *len = 0;
393 return(0);
394 }
395 }
396 *len = 1;
397 return(*ctxt->input->cur);
398 }
399
400 /*
401 * Humm this is bad, do an automatic flow conversion
402 */
403 guess = htmlFindEncoding(ctxt);
404 if (guess == NULL) {
405 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
406 } else {
407 xmlSwitchEncodingName(ctxt, (const char *) guess);
408 xmlFree(guess);
409 }
410 ctxt->input->flags |= XML_INPUT_HAS_ENCODING;
411 }
412
413 /*
414 * We are supposed to handle UTF8, check it's valid
415 * From rfc2044: encoding of the Unicode values on UTF-8:
416 *
417 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
418 * 0000 0000-0000 007F 0xxxxxxx
419 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
420 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
421 *
422 * Check for the 0x110000 limit too
423 */
424 cur = ctxt->input->cur;
425 c = *cur;
426 if (c & 0x80) {
427 size_t avail;
428
429 if ((c & 0x40) == 0)
430 goto encoding_error;
431
432 avail = ctxt->input->end - ctxt->input->cur;
433
434 if ((avail < 2) || ((cur[1] & 0xc0) != 0x80))
435 goto encoding_error;
436 if ((c & 0xe0) == 0xe0) {
437 if ((avail < 3) || ((cur[2] & 0xc0) != 0x80))
438 goto encoding_error;
439 if ((c & 0xf0) == 0xf0) {
440 if (((c & 0xf8) != 0xf0) ||
441 (avail < 4) || ((cur[3] & 0xc0) != 0x80))
442 goto encoding_error;
443 /* 4-byte code */
444 *len = 4;
445 val = (cur[0] & 0x7) << 18;
446 val |= (cur[1] & 0x3f) << 12;
447 val |= (cur[2] & 0x3f) << 6;
448 val |= cur[3] & 0x3f;
449 if (val < 0x10000)
450 goto encoding_error;
451 } else {
452 /* 3-byte code */
453 *len = 3;
454 val = (cur[0] & 0xf) << 12;
455 val |= (cur[1] & 0x3f) << 6;
456 val |= cur[2] & 0x3f;
457 if (val < 0x800)
458 goto encoding_error;
459 }
460 } else {
461 /* 2-byte code */
462 *len = 2;
463 val = (cur[0] & 0x1f) << 6;
464 val |= cur[1] & 0x3f;
465 if (val < 0x80)
466 goto encoding_error;
467 }
468 if (!IS_CHAR(val)) {
469 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
470 "Char 0x%X out of allowed range\n", val);
471 }
472 return(val);
473 } else {
474 if (*ctxt->input->cur == 0) {
475 if (ctxt->input->cur < ctxt->input->end) {
476 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
477 "Char 0x%X out of allowed range\n", 0);
478 *len = 1;
479 return(' ');
480 } else {
481 *len = 0;
482 return(0);
483 }
484 }
485 /* 1-byte code */
486 *len = 1;
487 return(*ctxt->input->cur);
488 }
489
490encoding_error:
491 xmlCtxtErrIO(ctxt, XML_ERR_INVALID_ENCODING, NULL);
492
493 if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0)
494 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
495 *len = 1;
496 return(*ctxt->input->cur);
497}
498
499/**
500 * htmlSkipBlankChars:
501 * @ctxt: the HTML parser context
502 *
503 * skip all blanks character found at that point in the input streams.
504 *
505 * Returns the number of space chars skipped
506 */
507
508static int
509htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
510 int res = 0;
511
512 while (IS_BLANK_CH(*(ctxt->input->cur))) {
513 if (*(ctxt->input->cur) == '\n') {
514 ctxt->input->line++; ctxt->input->col = 1;
515 } else ctxt->input->col++;
516 ctxt->input->cur++;
517 if (*ctxt->input->cur == 0)
518 xmlParserGrow(ctxt);
519 if (res < INT_MAX)
520 res++;
521 }
522 return(res);
523}
524
525
526
527/************************************************************************
528 * *
529 * The list of HTML elements and their properties *
530 * *
531 ************************************************************************/
532
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *      , subElements , impliedsubelt , Attributes, userdata
 */

/*
 * Definitions and a couple of vars for HTML Elements.
 *
 * Each upper-case macro expands to a comma separated list of element
 * names and has an NB_ companion giving the number of names in it.
 * Every array built from these lists is terminated by a NULL entry.
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata


/* ... and for HTML Attributes */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
/* Was spelled NB_NB_COREATTRS, an undefined name. */
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;


/* Other declarations that should go inline ... */
static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
	"tabindex", "onfocus", "onblur", NULL } ;
static const char* const target_attr[] = { "target", NULL } ;
static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
static const char* const alt_attr[] = { "alt", NULL } ;
static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
static const char* const href_attrs[] = { "href", NULL } ;
static const char* const clear_attrs[] = { "clear", NULL } ;
static const char* const inline_p[] = { INLINE, "p", NULL } ;

static const char* const flow_param[] = { FLOW, "param", NULL } ;
static const char* const applet_attrs[] = { COREATTRS , "codebase",
		"archive", "alt", "name", "height", "width", "align",
		"hspace", "vspace", NULL } ;
static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const basefont_attrs[] =
	{ "id", "size", "color", "face", NULL } ;
static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
static const char* const body_depr[] = { "background", "bgcolor", "text",
	"link", "vlink", "alink", NULL } ;
static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;


static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const col_elt[] = { "col", NULL } ;
static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
static const char* const dl_contents[] = { "dt", "dd", NULL } ;
static const char* const compact_attr[] = { "compact", NULL } ;
static const char* const label_attr[] = { "label", NULL } ;
/* The NULL terminator was missing here, unlike every sibling list. */
static const char* const fieldset_contents[] = { FLOW, "legend", NULL } ;
static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
static const char* const head_attrs[] = { I18N, "profile", NULL } ;
static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
static const char* const version_attr[] = { "version", NULL } ;
static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* const align_attr[] = { "align", NULL } ;
static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
static const char* const name_attr[] = { "name", NULL } ;
static const char* const action_attr[] = { "action", NULL } ;
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
static const char* const content_attr[] = { "content", NULL } ;
static const char* const type_attr[] = { "type", NULL } ;
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
static const char* const object_contents[] = { FLOW, "param", NULL } ;
static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
static const char* const option_elt[] = { "option", NULL } ;
static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
static const char* const width_attr[] = { "width", NULL } ;
static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
static const char* const language_attr[] = { "language", NULL } ;
static const char* const select_content[] = { "optgroup", "option", NULL } ;
static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
static const char* const tr_elt[] = { "tr", NULL } ;
static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
static const char* const tr_contents[] = { "th", "td", NULL } ;
static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
static const char* const li_elt[] = { "li", NULL } ;
static const char* const ul_depr[] = { "type", "compact", NULL} ;
static const char* const dir_attr[] = { "dir", NULL} ;
695
696#define DECL (const char**)
697
698static const htmlElemDesc
699html40ElementTable[] = {
700{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
701 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
702},
703{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
704 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
705},
706{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
707 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
708},
709{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
710 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
711},
712{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
713 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
714},
715{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
716 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
717},
718{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
719 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
720},
721{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
722 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
723},
724{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
725 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
726},
727{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
728 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
729},
730{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
731 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
732},
733{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
734 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
735},
736{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
737 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
738},
739{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
740 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
741},
742{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
743 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
744},
745{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
746 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
747},
748{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
749 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
750},
751{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
752 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
753},
754{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
755 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
756},
757{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
758 EMPTY , NULL , DECL col_attrs , NULL, NULL
759},
760{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
761 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
762},
763{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
764 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
765},
766{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
767 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
768},
769{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
770 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
771},
772{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
773 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
774},
775{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
776 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
777},
778{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
779 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
780},
781{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
782 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
783},
784{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
785 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
786},
787{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
788 EMPTY, NULL, DECL embed_attrs, NULL, NULL
789},
790{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
791 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
792},
793{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
794 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
795},
796{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
797 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
798},
799{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
800 EMPTY, NULL, NULL, DECL frame_attrs, NULL
801},
802{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
803 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
804},
805{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
806 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
807},
808{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
809 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
810},
811{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
812 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
813},
814{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
815 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
816},
817{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
818 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
819},
820{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
821 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
822},
823{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
824 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
825},
826{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
827 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
828},
829{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
830 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
831},
832{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
833 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
834},
835{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
836 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
837},
838{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
839 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
840},
841{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
842 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
843},
844{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
845 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
846},
847{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
848 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
849},
850{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
851 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
852},
853{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
854 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
855},
856{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
857 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
858},
859{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
860 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
861},
862{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
863 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
864},
865{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
866 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
867},
868{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
869 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
870},
871{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
872 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
873},
874{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
875 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
876},
877{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
878 DECL html_flow, "div", DECL html_attrs, NULL, NULL
879},
880{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
881 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
882},
883{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
884 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
885},
886{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
887 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
888},
889{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
890 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
891},
892{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
893 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
894},
895{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
896 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
897},
898{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
899 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
900},
901{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
902 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
903},
904{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
905 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
906},
907{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
908 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
909},
910{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
911 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
912},
913{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
914 DECL select_content, NULL, DECL select_attrs, NULL, NULL
915},
916{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
917 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
918},
919{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
920 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
921},
922{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
923 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
924},
925{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
926 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
927},
928{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
929 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
930},
931{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
932 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
933},
934{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
935 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
936},
937{ "table", 0, 0, 0, 0, 0, 0, 0, "",
938 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
939},
940{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
941 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
942},
943{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
944 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
945},
946{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
947 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
948},
949{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
950 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
951},
952{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
953 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
954},
955{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
956 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
957},
958{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
959 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
960},
961{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
962 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
963},
964{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
965 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
966},
967{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
968 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
969},
970{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
971 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
972},
973{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
974 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
975}
976};
977
/*
 * One auto-close rule: while an element named oldTag is the current open
 * element, a start tag named newTag implies the end of oldTag.
 */
typedef struct {
    const char *oldTag;
    const char *newTag;
} htmlStartCloseEntry;

/*
 * Start tags that imply the end of the current element, grouped by the
 * element being closed.
 *
 * htmlCheckAutoClose() searches this table with bsearch() and
 * htmlCompareStartClose(), so the entries must remain sorted by oldTag
 * and then by newTag, both in strcmp() order.
 */
static const htmlStartCloseEntry htmlStartClose[] = {
    { "a", "a" }, { "a", "fieldset" }, { "a", "table" }, { "a", "td" },
    { "a", "th" },
    { "address", "dd" }, { "address", "dl" }, { "address", "dt" },
    { "address", "form" }, { "address", "li" }, { "address", "ul" },
    { "b", "center" }, { "b", "p" }, { "b", "td" }, { "b", "th" },
    { "big", "p" },
    { "caption", "col" }, { "caption", "colgroup" }, { "caption", "tbody" },
    { "caption", "tfoot" }, { "caption", "thead" }, { "caption", "tr" },
    { "col", "col" }, { "col", "colgroup" }, { "col", "tbody" },
    { "col", "tfoot" }, { "col", "thead" }, { "col", "tr" },
    { "colgroup", "colgroup" }, { "colgroup", "tbody" },
    { "colgroup", "tfoot" }, { "colgroup", "thead" }, { "colgroup", "tr" },
    { "dd", "dt" },
    { "dir", "dd" }, { "dir", "dl" }, { "dir", "dt" }, { "dir", "form" },
    { "dir", "ul" },
    { "dl", "form" }, { "dl", "li" },
    { "dt", "dd" }, { "dt", "dl" },
    { "font", "center" }, { "font", "td" }, { "font", "th" },
    { "form", "form" },
    { "h1", "fieldset" }, { "h1", "form" }, { "h1", "li" }, { "h1", "p" },
    { "h1", "table" },
    { "h2", "fieldset" }, { "h2", "form" }, { "h2", "li" }, { "h2", "p" },
    { "h2", "table" },
    { "h3", "fieldset" }, { "h3", "form" }, { "h3", "li" }, { "h3", "p" },
    { "h3", "table" },
    { "h4", "fieldset" }, { "h4", "form" }, { "h4", "li" }, { "h4", "p" },
    { "h4", "table" },
    { "h5", "fieldset" }, { "h5", "form" }, { "h5", "li" }, { "h5", "p" },
    { "h5", "table" },
    { "h6", "fieldset" }, { "h6", "form" }, { "h6", "li" }, { "h6", "p" },
    { "h6", "table" },
    { "head", "a" }, { "head", "abbr" }, { "head", "acronym" },
    { "head", "address" }, { "head", "b" }, { "head", "bdo" },
    { "head", "big" }, { "head", "blockquote" }, { "head", "body" },
    { "head", "br" }, { "head", "center" }, { "head", "cite" },
    { "head", "code" }, { "head", "dd" }, { "head", "dfn" },
    { "head", "dir" }, { "head", "div" }, { "head", "dl" },
    { "head", "dt" }, { "head", "em" }, { "head", "fieldset" },
    { "head", "font" }, { "head", "form" }, { "head", "frameset" },
    { "head", "h1" }, { "head", "h2" }, { "head", "h3" },
    { "head", "h4" }, { "head", "h5" }, { "head", "h6" },
    { "head", "hr" }, { "head", "i" }, { "head", "iframe" },
    { "head", "img" }, { "head", "kbd" }, { "head", "li" },
    { "head", "listing" }, { "head", "map" }, { "head", "menu" },
    { "head", "ol" }, { "head", "p" }, { "head", "pre" },
    { "head", "q" }, { "head", "s" }, { "head", "samp" },
    { "head", "small" }, { "head", "span" }, { "head", "strike" },
    { "head", "strong" }, { "head", "sub" }, { "head", "sup" },
    { "head", "table" }, { "head", "tt" }, { "head", "u" },
    { "head", "ul" }, { "head", "var" }, { "head", "xmp" },
    { "hr", "form" },
    { "i", "center" }, { "i", "p" }, { "i", "td" }, { "i", "th" },
    { "legend", "fieldset" },
    { "li", "li" },
    { "link", "body" }, { "link", "frameset" },
    { "listing", "dd" }, { "listing", "dl" }, { "listing", "dt" },
    { "listing", "fieldset" }, { "listing", "form" }, { "listing", "li" },
    { "listing", "table" }, { "listing", "ul" },
    { "menu", "dd" }, { "menu", "dl" }, { "menu", "dt" }, { "menu", "form" },
    { "menu", "ul" },
    { "ol", "form" },
    { "option", "optgroup" }, { "option", "option" },
    { "p", "address" }, { "p", "blockquote" }, { "p", "body" },
    { "p", "caption" }, { "p", "center" }, { "p", "col" },
    { "p", "colgroup" }, { "p", "dd" }, { "p", "dir" }, { "p", "div" },
    { "p", "dl" }, { "p", "dt" }, { "p", "fieldset" }, { "p", "form" },
    { "p", "frameset" }, { "p", "h1" }, { "p", "h2" }, { "p", "h3" },
    { "p", "h4" }, { "p", "h5" }, { "p", "h6" }, { "p", "head" },
    { "p", "hr" }, { "p", "li" }, { "p", "listing" }, { "p", "menu" },
    { "p", "ol" }, { "p", "p" }, { "p", "pre" }, { "p", "table" },
    { "p", "tbody" }, { "p", "td" }, { "p", "tfoot" }, { "p", "th" },
    { "p", "title" }, { "p", "tr" }, { "p", "ul" }, { "p", "xmp" },
    { "pre", "dd" }, { "pre", "dl" }, { "pre", "dt" }, { "pre", "fieldset" },
    { "pre", "form" }, { "pre", "li" }, { "pre", "table" }, { "pre", "ul" },
    { "s", "p" },
    { "script", "noscript" },
    { "small", "p" },
    { "span", "td" }, { "span", "th" },
    { "strike", "p" },
    { "style", "body" }, { "style", "frameset" },
    { "tbody", "tbody" }, { "tbody", "tfoot" },
    { "td", "tbody" }, { "td", "td" }, { "td", "tfoot" }, { "td", "th" },
    { "td", "tr" },
    { "tfoot", "tbody" },
    { "th", "tbody" }, { "th", "td" }, { "th", "tfoot" }, { "th", "th" },
    { "th", "tr" },
    { "thead", "tbody" }, { "thead", "tfoot" },
    { "title", "body" }, { "title", "frameset" },
    { "tr", "tbody" }, { "tr", "tfoot" }, { "tr", "tr" },
    { "tt", "p" },
    { "u", "p" }, { "u", "td" }, { "u", "th" },
    { "ul", "address" }, { "ul", "form" }, { "ul", "menu" }, { "ul", "pre" },
    { "xmp", "dd" }, { "xmp", "dl" }, { "xmp", "dt" }, { "xmp", "fieldset" },
    { "xmp", "form" }, { "xmp", "li" }, { "xmp", "table" }, { "xmp", "ul" }
};
1237
/*
 * NULL-terminated list of HTML elements which are not supposed to hold
 * character data directly; htmlCheckParagraph() implies a p element when
 * one of them is the current element.
 *
 * TODO: extend the list by reading the HTML SGML DTD on implied
 * paragraphs.
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
1250
/*
 * The HTML attributes whose content is of type %Script;.
 *
 * NOTE: htmlIsScriptAttribute() rejects any name that does not start with
 * "on" before consulting this list, so review that function when adding
 * an entry.
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse */
    "onclick", "ondblclick", "onmousedown", "onmouseup",
    "onmouseover", "onmousemove", "onmouseout",
    /* keyboard */
    "onkeypress", "onkeydown", "onkeyup",
    /* document */
    "onload", "onunload",
    /* focus */
    "onfocus", "onblur",
    /* forms */
    "onsubmit", "onreset", "onchange", "onselect"
};
1276
/*
 * End-tag priorities, used by the parser to cope with broken HTML pages.
 * Giving elements different priorities lets the parser decide what to do
 * with a stray end tag: an end tag may only close elements whose priority
 * is lower than or equal to its own.
 */

typedef struct {
    const char *name;
    int priority;
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 }, { "th",    160 },
    { "tr",    170 },
    { "thead", 180 }, { "tbody", 180 }, { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 }, { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority; also terminates the table */
};
1304
1305/************************************************************************
1306 * *
1307 * functions to handle HTML specific data *
1308 * *
1309 ************************************************************************/
1310
/**
 * htmlInitAutoClose:
 *
 * DEPRECATED: This function does nothing; calling it has no effect.
 */
void
htmlInitAutoClose(void)
{
    /* Intentionally empty. */
}
1319
1320static int
1321htmlCompareTags(const void *key, const void *member) {
1322 const xmlChar *tag = (const xmlChar *) key;
1323 const htmlElemDesc *desc = (const htmlElemDesc *) member;
1324
1325 return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1326}
1327
1328/**
1329 * htmlTagLookup:
1330 * @tag: The tag name in lowercase
1331 *
1332 * Lookup the HTML tag in the ElementTable
1333 *
1334 * Returns the related htmlElemDescPtr or NULL if not found.
1335 */
1336const htmlElemDesc *
1337htmlTagLookup(const xmlChar *tag) {
1338 if (tag == NULL)
1339 return(NULL);
1340
1341 return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1342 sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1343 sizeof(htmlElemDesc), htmlCompareTags));
1344}
1345
1346/**
1347 * htmlGetEndPriority:
1348 * @name: The name of the element to look up the priority for.
1349 *
1350 * Return value: The "endtag" priority.
1351 **/
1352static int
1353htmlGetEndPriority (const xmlChar *name) {
1354 int i = 0;
1355
1356 while ((htmlEndPriority[i].name != NULL) &&
1357 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1358 i++;
1359
1360 return(htmlEndPriority[i].priority);
1361}
1362
1363
1364static int
1365htmlCompareStartClose(const void *vkey, const void *member) {
1366 const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1367 const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1368 int ret;
1369
1370 ret = strcmp(key->oldTag, entry->oldTag);
1371 if (ret == 0)
1372 ret = strcmp(key->newTag, entry->newTag);
1373
1374 return(ret);
1375}
1376
1377/**
1378 * htmlCheckAutoClose:
1379 * @newtag: The new tag name
1380 * @oldtag: The old tag name
1381 *
1382 * Checks whether the new tag is one of the registered valid tags for
1383 * closing old.
1384 *
1385 * Returns 0 if no, 1 if yes.
1386 */
1387static int
1388htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1389{
1390 htmlStartCloseEntry key;
1391 void *res;
1392
1393 key.oldTag = (const char *) oldtag;
1394 key.newTag = (const char *) newtag;
1395 res = bsearch(&key, htmlStartClose,
1396 sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1397 sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1398 return(res != NULL);
1399}
1400
1401/**
1402 * htmlAutoCloseOnClose:
1403 * @ctxt: an HTML parser context
1404 * @newtag: The new tag name
1405 * @force: force the tag closure
1406 *
1407 * The HTML DTD allows an ending tag to implicitly close other tags.
1408 */
1409static void
1410htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1411{
1412 const htmlElemDesc *info;
1413 int i, priority;
1414
1415 priority = htmlGetEndPriority(newtag);
1416
1417 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1418
1419 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1420 break;
1421 /*
1422 * A misplaced endtag can only close elements with lower
1423 * or equal priority, so if we find an element with higher
1424 * priority before we find an element with
1425 * matching name, we just ignore this endtag
1426 */
1427 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1428 return;
1429 }
1430 if (i < 0)
1431 return;
1432
1433 while (!xmlStrEqual(newtag, ctxt->name)) {
1434 info = htmlTagLookup(ctxt->name);
1435 if ((info != NULL) && (info->endTag == 3)) {
1436 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1437 "Opening and ending tag mismatch: %s and %s\n",
1438 newtag, ctxt->name);
1439 }
1440 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1441 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1442 htmlnamePop(ctxt);
1443 }
1444}
1445
1446/**
1447 * htmlAutoCloseOnEnd:
1448 * @ctxt: an HTML parser context
1449 *
1450 * Close all remaining tags at the end of the stream
1451 */
1452static void
1453htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1454{
1455 int i;
1456
1457 if (ctxt->nameNr == 0)
1458 return;
1459 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1460 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1461 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1462 htmlnamePop(ctxt);
1463 }
1464}
1465
1466/**
1467 * htmlAutoClose:
1468 * @ctxt: an HTML parser context
1469 * @newtag: The new tag name or NULL
1470 *
1471 * The HTML DTD allows a tag to implicitly close other tags.
1472 * The list is kept in htmlStartClose array. This function is
1473 * called when a new tag has been detected and generates the
1474 * appropriates closes if possible/needed.
1475 * If newtag is NULL this mean we are at the end of the resource
1476 * and we should check
1477 */
1478static void
1479htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1480{
1481 if (newtag == NULL)
1482 return;
1483
1484 while ((ctxt->name != NULL) &&
1485 (htmlCheckAutoClose(newtag, ctxt->name))) {
1486 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1487 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1488 htmlnamePop(ctxt);
1489 }
1490}
1491
1492/**
1493 * htmlAutoCloseTag:
1494 * @doc: the HTML document
1495 * @name: The tag name
1496 * @elem: the HTML element
1497 *
1498 * The HTML DTD allows a tag to implicitly close other tags.
1499 * The list is kept in htmlStartClose array. This function checks
1500 * if the element or one of it's children would autoclose the
1501 * given tag.
1502 *
1503 * Returns 1 if autoclose, 0 otherwise
1504 */
1505int
1506htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1507 htmlNodePtr child;
1508
1509 if (elem == NULL) return(1);
1510 if (xmlStrEqual(name, elem->name)) return(0);
1511 if (htmlCheckAutoClose(elem->name, name)) return(1);
1512 child = elem->children;
1513 while (child != NULL) {
1514 if (htmlAutoCloseTag(doc, name, child)) return(1);
1515 child = child->next;
1516 }
1517 return(0);
1518}
1519
1520/**
1521 * htmlIsAutoClosed:
1522 * @doc: the HTML document
1523 * @elem: the HTML element
1524 *
1525 * The HTML DTD allows a tag to implicitly close other tags.
1526 * The list is kept in htmlStartClose array. This function checks
1527 * if a tag is autoclosed by one of it's child
1528 *
1529 * Returns 1 if autoclosed, 0 otherwise
1530 */
1531int
1532htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1533 htmlNodePtr child;
1534
1535 if (elem == NULL) return(1);
1536 child = elem->children;
1537 while (child != NULL) {
1538 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1539 child = child->next;
1540 }
1541 return(0);
1542}
1543
1544/**
1545 * htmlCheckImplied:
1546 * @ctxt: an HTML parser context
1547 * @newtag: The new tag name
1548 *
1549 * The HTML DTD allows a tag to exists only implicitly
1550 * called when a new tag has been detected and generates the
1551 * appropriates implicit tags if missing
1552 */
1553static void
1554htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1555 int i;
1556
1557 if (ctxt->options & HTML_PARSE_NOIMPLIED)
1558 return;
1559 if (!htmlOmittedDefaultValue)
1560 return;
1561 if (xmlStrEqual(newtag, BAD_CAST"html"))
1562 return;
1563 if (ctxt->nameNr <= 0) {
1564 htmlnamePush(ctxt, BAD_CAST"html");
1565 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1566 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1567 }
1568 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1569 return;
1570 if ((ctxt->nameNr <= 1) &&
1571 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1572 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1573 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1574 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1575 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1576 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1577 if (ctxt->html >= 3) {
1578 /* we already saw or generated an <head> before */
1579 return;
1580 }
1581 /*
1582 * dropped OBJECT ... i you put it first BODY will be
1583 * assumed !
1584 */
1585 htmlnamePush(ctxt, BAD_CAST"head");
1586 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1587 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1588 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1589 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1590 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1591 if (ctxt->html >= 10) {
1592 /* we already saw or generated a <body> before */
1593 return;
1594 }
1595 for (i = 0;i < ctxt->nameNr;i++) {
1596 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1597 return;
1598 }
1599 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1600 return;
1601 }
1602 }
1603
1604 htmlnamePush(ctxt, BAD_CAST"body");
1605 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1606 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1607 }
1608}
1609
1610/**
1611 * htmlCheckParagraph
1612 * @ctxt: an HTML parser context
1613 *
1614 * Check whether a p element need to be implied before inserting
1615 * characters in the current element.
1616 *
1617 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1618 * in case of error.
1619 */
1620
1621static int
1622htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1623 const xmlChar *tag;
1624 int i;
1625
1626 if (ctxt == NULL)
1627 return(-1);
1628 tag = ctxt->name;
1629 if (tag == NULL) {
1630 htmlAutoClose(ctxt, BAD_CAST"p");
1631 htmlCheckImplied(ctxt, BAD_CAST"p");
1632 htmlnamePush(ctxt, BAD_CAST"p");
1633 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1634 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1635 return(1);
1636 }
1637 if (!htmlOmittedDefaultValue)
1638 return(0);
1639 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1640 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1641 htmlAutoClose(ctxt, BAD_CAST"p");
1642 htmlCheckImplied(ctxt, BAD_CAST"p");
1643 htmlnamePush(ctxt, BAD_CAST"p");
1644 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1645 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1646 return(1);
1647 }
1648 }
1649 return(0);
1650}
1651
1652/**
1653 * htmlIsScriptAttribute:
1654 * @name: an attribute name
1655 *
1656 * Check if an attribute is of content type Script
1657 *
1658 * Returns 1 is the attribute is a script 0 otherwise
1659 */
1660int
1661htmlIsScriptAttribute(const xmlChar *name) {
1662 unsigned int i;
1663
1664 if (name == NULL)
1665 return(0);
1666 /*
1667 * all script attributes start with 'on'
1668 */
1669 if ((name[0] != 'o') || (name[1] != 'n'))
1670 return(0);
1671 for (i = 0;
1672 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1673 i++) {
1674 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1675 return(1);
1676 }
1677 return(0);
1678}
1679
1680/************************************************************************
1681 * *
1682 * The list of HTML predefined entities *
1683 * *
1684 ************************************************************************/
1685
1686
1687static const htmlEntityDesc html40EntitiesTable[] = {
1688/*
1689 * the 4 absolute ones, plus apostrophe.
1690 */
1691{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1692{ 38, "amp", "ampersand, U+0026 ISOnum" },
1693{ 39, "apos", "single quote" },
1694{ 60, "lt", "less-than sign, U+003C ISOnum" },
1695{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1696
1697/*
1698 * A bunch still in the 128-255 range
1699 * Replacing them depend really on the charset used.
1700 */
1701{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1702{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1703{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1704{ 163, "pound","pound sign, U+00A3 ISOnum" },
1705{ 164, "curren","currency sign, U+00A4 ISOnum" },
1706{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1707{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1708{ 167, "sect", "section sign, U+00A7 ISOnum" },
1709{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1710{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1711{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1712{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1713{ 172, "not", "not sign, U+00AC ISOnum" },
1714{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1715{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1716{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1717{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1718{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1719{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1720{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1721{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1722{ 181, "micro","micro sign, U+00B5 ISOnum" },
1723{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1724{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1725{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1726{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1727{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1728{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1729{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1730{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1731{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1732{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1733{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1734{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1735{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1736{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1737{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1738{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1739{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1740{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1741{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1742{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1743{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1744{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1745{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1746{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1747{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1748{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1749{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1750{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1751{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1752{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1753{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1754{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1755{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1756{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1757{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1758{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1759{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1760{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1761{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1762{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1763{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1764{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1765{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1766{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1767{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1768{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1769{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1770{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1771{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1772{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1773{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1774{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1775{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1776{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1777{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1778{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1779{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1780{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1781{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1782{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1783{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1784{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1785{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1786{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1787{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1788{ 247, "divide","division sign, U+00F7 ISOnum" },
1789{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1790{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1791{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1792{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1793{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1794{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1795{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1796{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1797
1798{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1799{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1800{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1801{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1802{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1803
1804/*
1805 * Anything below should really be kept as entities references
1806 */
1807{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1808
1809{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1810{ 732, "tilde","small tilde, U+02DC ISOdia" },
1811
1812{ 913, "Alpha","greek capital letter alpha, U+0391" },
1813{ 914, "Beta", "greek capital letter beta, U+0392" },
1814{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1815{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1816{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1817{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1818{ 919, "Eta", "greek capital letter eta, U+0397" },
1819{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1820{ 921, "Iota", "greek capital letter iota, U+0399" },
1821{ 922, "Kappa","greek capital letter kappa, U+039A" },
1822{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1823{ 924, "Mu", "greek capital letter mu, U+039C" },
1824{ 925, "Nu", "greek capital letter nu, U+039D" },
1825{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1826{ 927, "Omicron","greek capital letter omicron, U+039F" },
1827{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1828{ 929, "Rho", "greek capital letter rho, U+03A1" },
1829{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1830{ 932, "Tau", "greek capital letter tau, U+03A4" },
1831{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1832{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1833{ 935, "Chi", "greek capital letter chi, U+03A7" },
1834{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1835{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1836
1837{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1838{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1839{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1840{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1841{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1842{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1843{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1844{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1845{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1846{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1847{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1848{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1849{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1850{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1851{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1852{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1853{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1854{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1855{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1856{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1857{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1858{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1859{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1860{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1861{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1862{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1863{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1864{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1865
1866{ 8194, "ensp", "en space, U+2002 ISOpub" },
1867{ 8195, "emsp", "em space, U+2003 ISOpub" },
1868{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1869{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1870{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1871{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1872{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1873{ 8211, "ndash","en dash, U+2013 ISOpub" },
1874{ 8212, "mdash","em dash, U+2014 ISOpub" },
1875{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1876{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1877{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1878{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1879{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1880{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1881{ 8224, "dagger","dagger, U+2020 ISOpub" },
1882{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1883
1884{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1885{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1886
1887{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1888
1889{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1890{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1891
1892{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1893{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1894
1895{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1896{ 8260, "frasl","fraction slash, U+2044 NEW" },
1897
1898{ 8364, "euro", "euro sign, U+20AC NEW" },
1899
1900{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1901{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1902{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1903{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1904{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1905{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1906{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1907{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1908{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1909{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1910{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1911{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1912{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1913{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1914{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1915{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1916
1917{ 8704, "forall","for all, U+2200 ISOtech" },
1918{ 8706, "part", "partial differential, U+2202 ISOtech" },
1919{ 8707, "exist","there exists, U+2203 ISOtech" },
1920{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1921{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1922{ 8712, "isin", "element of, U+2208 ISOtech" },
1923{ 8713, "notin","not an element of, U+2209 ISOtech" },
1924{ 8715, "ni", "contains as member, U+220B ISOtech" },
1925{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1926{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
1927{ 8722, "minus","minus sign, U+2212 ISOtech" },
1928{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1929{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1930{ 8733, "prop", "proportional to, U+221D ISOtech" },
1931{ 8734, "infin","infinity, U+221E ISOtech" },
1932{ 8736, "ang", "angle, U+2220 ISOamso" },
1933{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1934{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1935{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1936{ 8746, "cup", "union = cup, U+222A ISOtech" },
1937{ 8747, "int", "integral, U+222B ISOtech" },
1938{ 8756, "there4","therefore, U+2234 ISOtech" },
1939{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1940{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1941{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1942{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1943{ 8801, "equiv","identical to, U+2261 ISOtech" },
1944{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1945{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1946{ 8834, "sub", "subset of, U+2282 ISOtech" },
1947{ 8835, "sup", "superset of, U+2283 ISOtech" },
1948{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1949{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1950{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1951{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1952{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1953{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1954{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1955{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1956{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1957{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1958{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1959{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1960{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1961{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1962
1963{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1964{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1965{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1966{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1967
1968};
1969
1970/************************************************************************
1971 * *
1972 * Commodity functions to handle entities *
1973 * *
1974 ************************************************************************/
1975
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates @buffer to the new size. The
 * macro expects a parser context named `ctxt` to be visible at the point
 * of use. If the reallocation fails, the error is reported through
 * htmlErrMemory(), the old buffer is freed and the *enclosing function*
 * returns NULL.
 *
 * The body is wrapped in do { } while (0) so that the expansion is a
 * single statement and stays correct when used as the unbraced body of
 * an if/else. Call sites must terminate it with a semicolon.
 */
#define growBuffer(buffer) do {                                 \
    xmlChar *tmp;                                               \
    buffer##_size *= 2;                                         \
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size);        \
    if (tmp == NULL) {                                          \
        htmlErrMemory(ctxt);                                    \
        xmlFree(buffer);                                        \
        return(NULL);                                           \
    }                                                           \
    buffer = tmp;                                               \
} while (0)
1990
1991/**
1992 * htmlEntityLookup:
1993 * @name: the entity name
1994 *
1995 * Lookup the given entity in EntitiesTable
1996 *
1997 * TODO: the linear scan is really ugly, an hash table is really needed.
1998 *
1999 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2000 */
2001const htmlEntityDesc *
2002htmlEntityLookup(const xmlChar *name) {
2003 unsigned int i;
2004
2005 for (i = 0;i < (sizeof(html40EntitiesTable)/
2006 sizeof(html40EntitiesTable[0]));i++) {
2007 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
2008 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2009 }
2010 }
2011 return(NULL);
2012}
2013
2014static int
2015htmlCompareEntityDesc(const void *vkey, const void *vdesc) {
2016 const unsigned *key = vkey;
2017 const htmlEntityDesc *desc = vdesc;
2018
2019 return((int) *key - (int) desc->value);
2020}
2021
2022/**
2023 * htmlEntityValueLookup:
2024 * @value: the entity's unicode value
2025 *
2026 * Lookup the given entity in EntitiesTable
2027 *
2028 * TODO: the linear scan is really ugly, an hash table is really needed.
2029 *
2030 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2031 */
2032const htmlEntityDesc *
2033htmlEntityValueLookup(unsigned int value) {
2034 const htmlEntityDesc *desc;
2035 size_t nmemb;
2036
2037 nmemb = sizeof(html40EntitiesTable) / sizeof(html40EntitiesTable[0]);
2038 desc = bsearch(&value, html40EntitiesTable, nmemb, sizeof(htmlEntityDesc),
2039 htmlCompareEntityDesc);
2040
2041 return(desc);
2042}
2043
2044/**
2045 * UTF8ToHtml:
2046 * @out: a pointer to an array of bytes to store the result
2047 * @outlen: the length of @out
2048 * @in: a pointer to an array of UTF-8 chars
2049 * @inlen: the length of @in
2050 *
2051 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2052 * plus HTML entities block of chars out.
2053 *
2054 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2055 * The value of @inlen after return is the number of octets consumed
2056 * as the return value is positive, else unpredictable.
2057 * The value of @outlen after return is the number of octets consumed.
2058 */
2059int
2060UTF8ToHtml(unsigned char* out, int *outlen,
2061 const unsigned char* in, int *inlen) {
2062 const unsigned char* processed = in;
2063 const unsigned char* outend;
2064 const unsigned char* outstart = out;
2065 const unsigned char* instart = in;
2066 const unsigned char* inend;
2067 unsigned int c, d;
2068 int trailing;
2069
2070 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2071 if (in == NULL) {
2072 /*
2073 * initialization nothing to do
2074 */
2075 *outlen = 0;
2076 *inlen = 0;
2077 return(0);
2078 }
2079 inend = in + (*inlen);
2080 outend = out + (*outlen);
2081 while (in < inend) {
2082 d = *in++;
2083 if (d < 0x80) { c= d; trailing= 0; }
2084 else if (d < 0xC0) {
2085 /* trailing byte in leading position */
2086 *outlen = out - outstart;
2087 *inlen = processed - instart;
2088 return(-2);
2089 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2090 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2091 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2092 else {
2093 /* no chance for this in Ascii */
2094 *outlen = out - outstart;
2095 *inlen = processed - instart;
2096 return(-2);
2097 }
2098
2099 if (inend - in < trailing) {
2100 break;
2101 }
2102
2103 for ( ; trailing; trailing--) {
2104 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2105 break;
2106 c <<= 6;
2107 c |= d & 0x3F;
2108 }
2109
2110 /* assertion: c is a single UTF-4 value */
2111 if (c < 0x80) {
2112 if (out + 1 >= outend)
2113 break;
2114 *out++ = c;
2115 } else {
2116 int len;
2117 const htmlEntityDesc * ent;
2118 const char *cp;
2119 char nbuf[16];
2120
2121 /*
2122 * Try to lookup a predefined HTML entity for it
2123 */
2124
2125 ent = htmlEntityValueLookup(c);
2126 if (ent == NULL) {
2127 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2128 cp = nbuf;
2129 }
2130 else
2131 cp = ent->name;
2132 len = strlen(cp);
2133 if (out + 2 + len >= outend)
2134 break;
2135 *out++ = '&';
2136 memcpy(out, cp, len);
2137 out += len;
2138 *out++ = ';';
2139 }
2140 processed = in;
2141 }
2142 *outlen = out - outstart;
2143 *inlen = processed - instart;
2144 return(0);
2145}
2146
2147/**
2148 * htmlEncodeEntities:
2149 * @out: a pointer to an array of bytes to store the result
2150 * @outlen: the length of @out
2151 * @in: a pointer to an array of UTF-8 chars
2152 * @inlen: the length of @in
2153 * @quoteChar: the quote character to escape (' or ") or zero.
2154 *
2155 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2156 * plus HTML entities block of chars out.
2157 *
2158 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2159 * The value of @inlen after return is the number of octets consumed
2160 * as the return value is positive, else unpredictable.
2161 * The value of @outlen after return is the number of octets consumed.
2162 */
2163int
2164htmlEncodeEntities(unsigned char* out, int *outlen,
2165 const unsigned char* in, int *inlen, int quoteChar) {
2166 const unsigned char* processed = in;
2167 const unsigned char* outend;
2168 const unsigned char* outstart = out;
2169 const unsigned char* instart = in;
2170 const unsigned char* inend;
2171 unsigned int c, d;
2172 int trailing;
2173
2174 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2175 return(-1);
2176 outend = out + (*outlen);
2177 inend = in + (*inlen);
2178 while (in < inend) {
2179 d = *in++;
2180 if (d < 0x80) { c= d; trailing= 0; }
2181 else if (d < 0xC0) {
2182 /* trailing byte in leading position */
2183 *outlen = out - outstart;
2184 *inlen = processed - instart;
2185 return(-2);
2186 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2187 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2188 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2189 else {
2190 /* no chance for this in Ascii */
2191 *outlen = out - outstart;
2192 *inlen = processed - instart;
2193 return(-2);
2194 }
2195
2196 if (inend - in < trailing)
2197 break;
2198
2199 while (trailing--) {
2200 if (((d= *in++) & 0xC0) != 0x80) {
2201 *outlen = out - outstart;
2202 *inlen = processed - instart;
2203 return(-2);
2204 }
2205 c <<= 6;
2206 c |= d & 0x3F;
2207 }
2208
2209 /* assertion: c is a single UTF-4 value */
2210 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2211 (c != '&') && (c != '<') && (c != '>')) {
2212 if (out >= outend)
2213 break;
2214 *out++ = c;
2215 } else {
2216 const htmlEntityDesc * ent;
2217 const char *cp;
2218 char nbuf[16];
2219 int len;
2220
2221 /*
2222 * Try to lookup a predefined HTML entity for it
2223 */
2224 ent = htmlEntityValueLookup(c);
2225 if (ent == NULL) {
2226 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2227 cp = nbuf;
2228 }
2229 else
2230 cp = ent->name;
2231 len = strlen(cp);
2232 if (outend - out < len + 2)
2233 break;
2234 *out++ = '&';
2235 memcpy(out, cp, len);
2236 out += len;
2237 *out++ = ';';
2238 }
2239 processed = in;
2240 }
2241 *outlen = out - outstart;
2242 *inlen = processed - instart;
2243 return(0);
2244}
2245
2246/************************************************************************
2247 * *
2248 * Commodity functions, cleanup needed ? *
2249 * *
2250 ************************************************************************/
/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility
 *
 * Both the strings and the array of pointers are const: the table is
 * only ever read.
 */
static const char * const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
2265
2266/**
2267 * areBlanks:
2268 * @ctxt: an HTML parser context
2269 * @str: a xmlChar *
2270 * @len: the size of @str
2271 *
2272 * Is this a sequence of blank chars that one can ignore ?
2273 *
2274 * Returns 1 if ignorable 0 otherwise.
2275 */
2276
2277static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2278 unsigned int i;
2279 int j;
2280 xmlNodePtr lastChild;
2281 xmlDtdPtr dtd;
2282
2283 for (j = 0;j < len;j++)
2284 if (!(IS_BLANK_CH(str[j]))) return(0);
2285
2286 if (CUR == 0) return(1);
2287 if (CUR != '<') return(0);
2288 if (ctxt->name == NULL)
2289 return(1);
2290 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2291 return(1);
2292 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2293 return(1);
2294
2295 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2296 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2297 dtd = xmlGetIntSubset(ctxt->myDoc);
2298 if (dtd != NULL && dtd->ExternalID != NULL) {
2299 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2300 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2301 return(1);
2302 }
2303 }
2304
2305 if (ctxt->node == NULL) return(0);
2306 lastChild = xmlGetLastChild(ctxt->node);
2307 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2308 lastChild = lastChild->prev;
2309 if (lastChild == NULL) {
2310 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2311 (ctxt->node->content != NULL)) return(0);
2312 /* keep ws in constructs like ...<b> </b>...
2313 for all tags "b" allowing PCDATA */
2314 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2315 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2316 return(0);
2317 }
2318 }
2319 } else if (xmlNodeIsText(lastChild)) {
2320 return(0);
2321 } else {
2322 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2323 for all tags "p" allowing PCDATA */
2324 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2325 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2326 return(0);
2327 }
2328 }
2329 }
2330 return(1);
2331}
2332
2333/**
2334 * htmlNewDocNoDtD:
2335 * @URI: URI for the dtd, or NULL
2336 * @ExternalID: the external ID of the DTD, or NULL
2337 *
2338 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2339 * are NULL
2340 *
2341 * Returns a new document, do not initialize the DTD if not provided
2342 */
2343htmlDocPtr
2344htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2345 xmlDocPtr cur;
2346
2347 /*
2348 * Allocate a new document and fill the fields.
2349 */
2350 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2351 if (cur == NULL)
2352 return(NULL);
2353 memset(cur, 0, sizeof(xmlDoc));
2354
2355 cur->type = XML_HTML_DOCUMENT_NODE;
2356 cur->version = NULL;
2357 cur->intSubset = NULL;
2358 cur->doc = cur;
2359 cur->name = NULL;
2360 cur->children = NULL;
2361 cur->extSubset = NULL;
2362 cur->oldNs = NULL;
2363 cur->encoding = NULL;
2364 cur->standalone = 1;
2365 cur->compression = 0;
2366 cur->ids = NULL;
2367 cur->refs = NULL;
2368 cur->_private = NULL;
2369 cur->charset = XML_CHAR_ENCODING_UTF8;
2370 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2371 if ((ExternalID != NULL) ||
2372 (URI != NULL)) {
2373 xmlDtdPtr intSubset;
2374
2375 intSubset = xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2376 if (intSubset == NULL) {
2377 xmlFree(cur);
2378 return(NULL);
2379 }
2380 }
2381 if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2382 xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
2383 return(cur);
2384}
2385
2386/**
2387 * htmlNewDoc:
2388 * @URI: URI for the dtd, or NULL
2389 * @ExternalID: the external ID of the DTD, or NULL
2390 *
2391 * Creates a new HTML document
2392 *
2393 * Returns a new document
2394 */
2395htmlDocPtr
2396htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2397 if ((URI == NULL) && (ExternalID == NULL))
2398 return(htmlNewDocNoDtD(
2399 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2400 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2401
2402 return(htmlNewDocNoDtD(URI, ExternalID));
2403}
2404
2405
2406/************************************************************************
2407 * *
2408 * The parser itself *
2409 * Relates to http://www.w3.org/TR/html40 *
2410 * *
2411 ************************************************************************/
2412
2413/************************************************************************
2414 * *
2415 * The parser itself *
2416 * *
2417 ************************************************************************/
2418
2419static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2420
2421static void
2422htmlSkipBogusComment(htmlParserCtxtPtr ctxt) {
2423 int c;
2424
2425 htmlParseErr(ctxt, XML_HTML_INCORRECTLY_OPENED_COMMENT,
2426 "Incorrectly opened comment\n", NULL, NULL);
2427
2428 while (PARSER_STOPPED(ctxt) == 0) {
2429 c = CUR;
2430 if (c == 0)
2431 break;
2432 NEXT;
2433 if (c == '>')
2434 break;
2435 }
2436}
2437
2438/**
2439 * htmlParseHTMLName:
2440 * @ctxt: an HTML parser context
2441 *
2442 * parse an HTML tag or attribute name, note that we convert it to lowercase
2443 * since HTML names are not case-sensitive.
2444 *
2445 * Returns the Tag Name parsed or NULL
2446 */
2447
2448static const xmlChar *
2449htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2450 const xmlChar *ret;
2451 int i = 0;
2452 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2453
2454 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2455 (CUR != ':') && (CUR != '.')) return(NULL);
2456
2457 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2458 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2459 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2460 (CUR == '.'))) {
2461 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2462 else loc[i] = CUR;
2463 i++;
2464
2465 NEXT;
2466 }
2467
2468 ret = xmlDictLookup(ctxt->dict, loc, i);
2469 if (ret == NULL)
2470 htmlErrMemory(ctxt);
2471
2472 return(ret);
2473}
2474
2475
2476/**
2477 * htmlParseHTMLName_nonInvasive:
2478 * @ctxt: an HTML parser context
2479 *
2480 * parse an HTML tag or attribute name, note that we convert it to lowercase
2481 * since HTML names are not case-sensitive, this doesn't consume the data
2482 * from the stream, it's a look-ahead
2483 *
2484 * Returns the Tag Name parsed or NULL
2485 */
2486
2487static const xmlChar *
2488htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2489 int i = 0;
2490 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2491 const xmlChar *ret;
2492
2493 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2494 (NXT(1) != ':')) return(NULL);
2495
2496 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2497 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2498 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2499 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2500 else loc[i] = NXT(1+i);
2501 i++;
2502 }
2503
2504 ret = xmlDictLookup(ctxt->dict, loc, i);
2505 if (ret == NULL)
2506 htmlErrMemory(ctxt);
2507
2508 return(ret);
2509}
2510
2511
2512/**
2513 * htmlParseName:
2514 * @ctxt: an HTML parser context
2515 *
2516 * parse an HTML name, this routine is case sensitive.
2517 *
2518 * Returns the Name parsed or NULL
2519 */
2520
2521static const xmlChar *
2522htmlParseName(htmlParserCtxtPtr ctxt) {
2523 const xmlChar *in;
2524 const xmlChar *ret;
2525 int count = 0;
2526
2527 GROW;
2528
2529 /*
2530 * Accelerator for simple ASCII names
2531 */
2532 in = ctxt->input->cur;
2533 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2534 ((*in >= 0x41) && (*in <= 0x5A)) ||
2535 (*in == '_') || (*in == ':')) {
2536 in++;
2537 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2538 ((*in >= 0x41) && (*in <= 0x5A)) ||
2539 ((*in >= 0x30) && (*in <= 0x39)) ||
2540 (*in == '_') || (*in == '-') ||
2541 (*in == ':') || (*in == '.'))
2542 in++;
2543
2544 if (in == ctxt->input->end)
2545 return(NULL);
2546
2547 if ((*in > 0) && (*in < 0x80)) {
2548 count = in - ctxt->input->cur;
2549 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2550 if (ret == NULL)
2551 htmlErrMemory(ctxt);
2552 ctxt->input->cur = in;
2553 ctxt->input->col += count;
2554 return(ret);
2555 }
2556 }
2557 return(htmlParseNameComplex(ctxt));
2558}
2559
2560static const xmlChar *
2561htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2562 int len = 0, l;
2563 int c;
2564 int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
2565 XML_MAX_TEXT_LENGTH :
2566 XML_MAX_NAME_LENGTH;
2567 const xmlChar *base = ctxt->input->base;
2568 const xmlChar *ret;
2569
2570 /*
2571 * Handler for more complex cases
2572 */
2573 c = CUR_CHAR(l);
2574 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2575 (!IS_LETTER(c) && (c != '_') &&
2576 (c != ':'))) {
2577 return(NULL);
2578 }
2579
2580 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2581 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2582 (c == '.') || (c == '-') ||
2583 (c == '_') || (c == ':') ||
2584 (IS_COMBINING(c)) ||
2585 (IS_EXTENDER(c)))) {
2586 len += l;
2587 if (len > maxLength) {
2588 htmlParseErr(ctxt, XML_ERR_NAME_TOO_LONG, "name too long", NULL, NULL);
2589 return(NULL);
2590 }
2591 NEXTL(l);
2592 c = CUR_CHAR(l);
2593 if (ctxt->input->base != base) {
2594 /*
2595 * We changed encoding from an unknown encoding
2596 * Input buffer changed location, so we better start again
2597 */
2598 return(htmlParseNameComplex(ctxt));
2599 }
2600 }
2601
2602 if (ctxt->input->cur - ctxt->input->base < len) {
2603 /* Sanity check */
2604 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2605 "unexpected change of input buffer", NULL, NULL);
2606 return (NULL);
2607 }
2608
2609 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len);
2610 if (ret == NULL)
2611 htmlErrMemory(ctxt);
2612
2613 return(ret);
2614}
2615
2616
2617/**
2618 * htmlParseHTMLAttribute:
2619 * @ctxt: an HTML parser context
2620 * @stop: a char stop value
2621 *
2622 * parse an HTML attribute value till the stop (quote), if
2623 * stop is 0 then it stops at the first space
2624 *
2625 * Returns the attribute parsed or NULL
2626 */
2627
2628static xmlChar *
2629htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2630 xmlChar *buffer = NULL;
2631 int buffer_size = 0;
2632 int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
2633 XML_MAX_HUGE_LENGTH :
2634 XML_MAX_TEXT_LENGTH;
2635 xmlChar *out = NULL;
2636 const xmlChar *name = NULL;
2637 const xmlChar *cur = NULL;
2638 const htmlEntityDesc * ent;
2639
2640 /*
2641 * allocate a translation buffer.
2642 */
2643 buffer_size = HTML_PARSER_BUFFER_SIZE;
2644 buffer = (xmlChar *) xmlMallocAtomic(buffer_size);
2645 if (buffer == NULL) {
2646 htmlErrMemory(ctxt);
2647 return(NULL);
2648 }
2649 out = buffer;
2650
2651 /*
2652 * Ok loop until we reach one of the ending chars
2653 */
2654 while ((PARSER_STOPPED(ctxt) == 0) &&
2655 (CUR != 0) && (CUR != stop)) {
2656 if ((stop == 0) && (CUR == '>')) break;
2657 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2658 if (CUR == '&') {
2659 if (NXT(1) == '#') {
2660 unsigned int c;
2661 int bits;
2662
2663 c = htmlParseCharRef(ctxt);
2664 if (c < 0x80)
2665 { *out++ = c; bits= -6; }
2666 else if (c < 0x800)
2667 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2668 else if (c < 0x10000)
2669 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2670 else
2671 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2672
2673 for ( ; bits >= 0; bits-= 6) {
2674 *out++ = ((c >> bits) & 0x3F) | 0x80;
2675 }
2676
2677 if (out - buffer > buffer_size - 100) {
2678 int indx = out - buffer;
2679
2680 growBuffer(buffer);
2681 out = &buffer[indx];
2682 }
2683 } else {
2684 ent = htmlParseEntityRef(ctxt, &name);
2685 if (name == NULL) {
2686 *out++ = '&';
2687 if (out - buffer > buffer_size - 100) {
2688 int indx = out - buffer;
2689
2690 growBuffer(buffer);
2691 out = &buffer[indx];
2692 }
2693 } else if (ent == NULL) {
2694 *out++ = '&';
2695 cur = name;
2696 while (*cur != 0) {
2697 if (out - buffer > buffer_size - 100) {
2698 int indx = out - buffer;
2699
2700 growBuffer(buffer);
2701 out = &buffer[indx];
2702 }
2703 *out++ = *cur++;
2704 }
2705 } else {
2706 unsigned int c;
2707 int bits;
2708
2709 if (out - buffer > buffer_size - 100) {
2710 int indx = out - buffer;
2711
2712 growBuffer(buffer);
2713 out = &buffer[indx];
2714 }
2715 c = ent->value;
2716 if (c < 0x80)
2717 { *out++ = c; bits= -6; }
2718 else if (c < 0x800)
2719 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2720 else if (c < 0x10000)
2721 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2722 else
2723 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2724
2725 for ( ; bits >= 0; bits-= 6) {
2726 *out++ = ((c >> bits) & 0x3F) | 0x80;
2727 }
2728 }
2729 }
2730 } else {
2731 unsigned int c;
2732 int bits, l;
2733
2734 if (out - buffer > buffer_size - 100) {
2735 int indx = out - buffer;
2736
2737 growBuffer(buffer);
2738 out = &buffer[indx];
2739 }
2740 c = CUR_CHAR(l);
2741 if (c < 0x80)
2742 { *out++ = c; bits= -6; }
2743 else if (c < 0x800)
2744 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2745 else if (c < 0x10000)
2746 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2747 else
2748 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2749
2750 for ( ; bits >= 0; bits-= 6) {
2751 *out++ = ((c >> bits) & 0x3F) | 0x80;
2752 }
2753 NEXTL(l);
2754 }
2755 if (out - buffer > maxLength) {
2756 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2757 "attribute value too long\n", NULL, NULL);
2758 xmlFree(buffer);
2759 return(NULL);
2760 }
2761 }
2762 *out = 0;
2763 return(buffer);
2764}
2765
2766/**
2767 * htmlParseEntityRef:
2768 * @ctxt: an HTML parser context
2769 * @str: location to store the entity name
2770 *
2771 * DEPRECATED: Internal function, don't use.
2772 *
2773 * parse an HTML ENTITY references
2774 *
2775 * [68] EntityRef ::= '&' Name ';'
2776 *
2777 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2778 * if non-NULL *str will have to be freed by the caller.
2779 */
2780const htmlEntityDesc *
2781htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2782 const xmlChar *name;
2783 const htmlEntityDesc * ent = NULL;
2784
2785 if (str != NULL) *str = NULL;
2786 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2787
2788 if (CUR == '&') {
2789 NEXT;
2790 name = htmlParseName(ctxt);
2791 if (name == NULL) {
2792 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2793 "htmlParseEntityRef: no name\n", NULL, NULL);
2794 } else {
2795 GROW;
2796 if (CUR == ';') {
2797 if (str != NULL)
2798 *str = name;
2799
2800 /*
2801 * Lookup the entity in the table.
2802 */
2803 ent = htmlEntityLookup(name);
2804 if (ent != NULL) /* OK that's ugly !!! */
2805 NEXT;
2806 } else {
2807 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2808 "htmlParseEntityRef: expecting ';'\n",
2809 NULL, NULL);
2810 if (str != NULL)
2811 *str = name;
2812 }
2813 }
2814 }
2815 return(ent);
2816}
2817
2818/**
2819 * htmlParseAttValue:
2820 * @ctxt: an HTML parser context
2821 *
2822 * parse a value for an attribute
2823 * Note: the parser won't do substitution of entities here, this
2824 * will be handled later in xmlStringGetNodeList, unless it was
2825 * asked for ctxt->replaceEntities != 0
2826 *
2827 * Returns the AttValue parsed or NULL.
2828 */
2829
2830static xmlChar *
2831htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2832 xmlChar *ret = NULL;
2833
2834 if (CUR == '"') {
2835 NEXT;
2836 ret = htmlParseHTMLAttribute(ctxt, '"');
2837 if (CUR != '"') {
2838 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2839 "AttValue: \" expected\n", NULL, NULL);
2840 } else
2841 NEXT;
2842 } else if (CUR == '\'') {
2843 NEXT;
2844 ret = htmlParseHTMLAttribute(ctxt, '\'');
2845 if (CUR != '\'') {
2846 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2847 "AttValue: ' expected\n", NULL, NULL);
2848 } else
2849 NEXT;
2850 } else {
2851 /*
2852 * That's an HTMLism, the attribute value may not be quoted
2853 */
2854 ret = htmlParseHTMLAttribute(ctxt, 0);
2855 if (ret == NULL) {
2856 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2857 "AttValue: no value found\n", NULL, NULL);
2858 }
2859 }
2860 return(ret);
2861}
2862
2863/**
2864 * htmlParseSystemLiteral:
2865 * @ctxt: an HTML parser context
2866 *
2867 * parse an HTML Literal
2868 *
2869 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2870 *
2871 * Returns the SystemLiteral parsed or NULL
2872 */
2873
2874static xmlChar *
2875htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2876 size_t len = 0, startPosition = 0;
2877 int err = 0;
2878 int quote;
2879 xmlChar *ret = NULL;
2880
2881 if ((CUR != '"') && (CUR != '\'')) {
2882 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2883 "SystemLiteral \" or ' expected\n", NULL, NULL);
2884 return(NULL);
2885 }
2886 quote = CUR;
2887 NEXT;
2888
2889 if (CUR_PTR < BASE_PTR)
2890 return(ret);
2891 startPosition = CUR_PTR - BASE_PTR;
2892
2893 while ((PARSER_STOPPED(ctxt) == 0) &&
2894 (CUR != 0) && (CUR != quote)) {
2895 /* TODO: Handle UTF-8 */
2896 if (!IS_CHAR_CH(CUR)) {
2897 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2898 "Invalid char in SystemLiteral 0x%X\n", CUR);
2899 err = 1;
2900 }
2901 NEXT;
2902 len++;
2903 }
2904 if (CUR != quote) {
2905 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2906 "Unfinished SystemLiteral\n", NULL, NULL);
2907 } else {
2908 if (err == 0) {
2909 ret = xmlStrndup((BASE_PTR+startPosition), len);
2910 if (ret == NULL) {
2911 htmlErrMemory(ctxt);
2912 return(NULL);
2913 }
2914 }
2915 NEXT;
2916 }
2917
2918 return(ret);
2919}
2920
2921/**
2922 * htmlParsePubidLiteral:
2923 * @ctxt: an HTML parser context
2924 *
2925 * parse an HTML public literal
2926 *
2927 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2928 *
2929 * Returns the PubidLiteral parsed or NULL.
2930 */
2931
2932static xmlChar *
2933htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2934 size_t len = 0, startPosition = 0;
2935 int err = 0;
2936 int quote;
2937 xmlChar *ret = NULL;
2938
2939 if ((CUR != '"') && (CUR != '\'')) {
2940 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2941 "PubidLiteral \" or ' expected\n", NULL, NULL);
2942 return(NULL);
2943 }
2944 quote = CUR;
2945 NEXT;
2946
2947 /*
2948 * Name ::= (Letter | '_') (NameChar)*
2949 */
2950 if (CUR_PTR < BASE_PTR)
2951 return(ret);
2952 startPosition = CUR_PTR - BASE_PTR;
2953
2954 while ((PARSER_STOPPED(ctxt) == 0) &&
2955 (CUR != 0) && (CUR != quote)) {
2956 if (!IS_PUBIDCHAR_CH(CUR)) {
2957 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2958 "Invalid char in PubidLiteral 0x%X\n", CUR);
2959 err = 1;
2960 }
2961 len++;
2962 NEXT;
2963 }
2964
2965 if (CUR != quote) {
2966 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2967 "Unfinished PubidLiteral\n", NULL, NULL);
2968 } else {
2969 if (err == 0) {
2970 ret = xmlStrndup((BASE_PTR + startPosition), len);
2971 if (ret == NULL) {
2972 htmlErrMemory(ctxt);
2973 return(NULL);
2974 }
2975 }
2976 NEXT;
2977 }
2978
2979 return(ret);
2980}
2981
2982/**
2983 * htmlParseScript:
2984 * @ctxt: an HTML parser context
2985 *
2986 * parse the content of an HTML SCRIPT or STYLE element
2987 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2988 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2989 * http://www.w3.org/TR/html4/types.html#type-script
2990 * http://www.w3.org/TR/html4/types.html#h-6.15
2991 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2992 *
2993 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2994 * element and the value of intrinsic event attributes. User agents must
2995 * not evaluate script data as HTML markup but instead must pass it on as
2996 * data to a script engine.
2997 * NOTES:
2998 * - The content is passed like CDATA
2999 * - the attributes for style and scripting "onXXX" are also described
3000 * as CDATA but SGML allows entities references in attributes so their
3001 * processing is identical as other attributes
3002 */
3003static void
3004htmlParseScript(htmlParserCtxtPtr ctxt) {
3005 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
3006 int nbchar = 0;
3007 int cur,l;
3008
3009 cur = CUR_CHAR(l);
3010 while (cur != 0) {
3011 if ((cur == '<') && (NXT(1) == '/')) {
3012 /*
3013 * One should break here, the specification is clear:
3014 * Authors should therefore escape "</" within the content.
3015 * Escape mechanisms are specific to each scripting or
3016 * style sheet language.
3017 *
3018 * In recovery mode, only break if end tag match the
3019 * current tag, effectively ignoring all tags inside the
3020 * script/style block and treating the entire block as
3021 * CDATA.
3022 */
3023 if (ctxt->recovery) {
3024 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
3025 xmlStrlen(ctxt->name)) == 0)
3026 {
3027 break; /* while */
3028 } else {
3029 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3030 "Element %s embeds close tag\n",
3031 ctxt->name, NULL);
3032 }
3033 } else {
3034 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
3035 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
3036 {
3037 break; /* while */
3038 }
3039 }
3040 }
3041 if (IS_CHAR(cur)) {
3042 COPY_BUF(l,buf,nbchar,cur);
3043 } else {
3044 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3045 "Invalid char in CDATA 0x%X\n", cur);
3046 }
3047 NEXTL(l);
3048 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3049 buf[nbchar] = 0;
3050 if (ctxt->sax->cdataBlock!= NULL) {
3051 /*
3052 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3053 */
3054 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3055 } else if (ctxt->sax->characters != NULL) {
3056 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3057 }
3058 nbchar = 0;
3059 SHRINK;
3060 }
3061 cur = CUR_CHAR(l);
3062 }
3063
3064 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3065 buf[nbchar] = 0;
3066 if (ctxt->sax->cdataBlock!= NULL) {
3067 /*
3068 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3069 */
3070 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3071 } else if (ctxt->sax->characters != NULL) {
3072 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3073 }
3074 }
3075}
3076
3077
3078/**
3079 * htmlParseCharDataInternal:
3080 * @ctxt: an HTML parser context
3081 * @readahead: optional read ahead character in ascii range
3082 *
3083 * parse a CharData section.
3084 * if we are within a CDATA section ']]>' marks an end of section.
3085 *
3086 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3087 */
3088
3089static void
3090htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3091 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3092 int nbchar = 0;
3093 int cur, l;
3094
3095 if (readahead)
3096 buf[nbchar++] = readahead;
3097
3098 cur = CUR_CHAR(l);
3099 while ((cur != '<') &&
3100 (cur != '&') &&
3101 (cur != 0) &&
3102 (!PARSER_STOPPED(ctxt))) {
3103 if (!(IS_CHAR(cur))) {
3104 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3105 "Invalid char in CDATA 0x%X\n", cur);
3106 } else {
3107 COPY_BUF(l,buf,nbchar,cur);
3108 }
3109 NEXTL(l);
3110 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3111 buf[nbchar] = 0;
3112
3113 /*
3114 * Ok the segment is to be consumed as chars.
3115 */
3116 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3117 if (areBlanks(ctxt, buf, nbchar)) {
3118 if (ctxt->keepBlanks) {
3119 if (ctxt->sax->characters != NULL)
3120 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3121 } else {
3122 if (ctxt->sax->ignorableWhitespace != NULL)
3123 ctxt->sax->ignorableWhitespace(ctxt->userData,
3124 buf, nbchar);
3125 }
3126 } else {
3127 htmlCheckParagraph(ctxt);
3128 if (ctxt->sax->characters != NULL)
3129 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3130 }
3131 }
3132 nbchar = 0;
3133 SHRINK;
3134 }
3135 cur = CUR_CHAR(l);
3136 }
3137 if (nbchar != 0) {
3138 buf[nbchar] = 0;
3139
3140 /*
3141 * Ok the segment is to be consumed as chars.
3142 */
3143 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3144 if (areBlanks(ctxt, buf, nbchar)) {
3145 if (ctxt->keepBlanks) {
3146 if (ctxt->sax->characters != NULL)
3147 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3148 } else {
3149 if (ctxt->sax->ignorableWhitespace != NULL)
3150 ctxt->sax->ignorableWhitespace(ctxt->userData,
3151 buf, nbchar);
3152 }
3153 } else {
3154 htmlCheckParagraph(ctxt);
3155 if (ctxt->sax->characters != NULL)
3156 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3157 }
3158 }
3159 }
3160}
3161
3162/**
3163 * htmlParseCharData:
3164 * @ctxt: an HTML parser context
3165 *
3166 * parse a CharData section.
3167 * if we are within a CDATA section ']]>' marks an end of section.
3168 *
3169 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3170 */
3171
3172static void
3173htmlParseCharData(htmlParserCtxtPtr ctxt) {
3174 htmlParseCharDataInternal(ctxt, 0);
3175}
3176
3177/**
3178 * htmlParseExternalID:
3179 * @ctxt: an HTML parser context
3180 * @publicID: a xmlChar** receiving PubidLiteral
3181 *
3182 * Parse an External ID or a Public ID
3183 *
3184 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3185 * | 'PUBLIC' S PubidLiteral S SystemLiteral
3186 *
3187 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3188 *
3189 * Returns the function returns SystemLiteral and in the second
3190 * case publicID receives PubidLiteral, is strict is off
3191 * it is possible to return NULL and have publicID set.
3192 */
3193
3194static xmlChar *
3195htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3196 xmlChar *URI = NULL;
3197
3198 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3199 (UPP(2) == 'S') && (UPP(3) == 'T') &&
3200 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3201 SKIP(6);
3202 if (!IS_BLANK_CH(CUR)) {
3203 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3204 "Space required after 'SYSTEM'\n", NULL, NULL);
3205 }
3206 SKIP_BLANKS;
3207 URI = htmlParseSystemLiteral(ctxt);
3208 if (URI == NULL) {
3209 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3210 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3211 }
3212 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3213 (UPP(2) == 'B') && (UPP(3) == 'L') &&
3214 (UPP(4) == 'I') && (UPP(5) == 'C')) {
3215 SKIP(6);
3216 if (!IS_BLANK_CH(CUR)) {
3217 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3218 "Space required after 'PUBLIC'\n", NULL, NULL);
3219 }
3220 SKIP_BLANKS;
3221 *publicID = htmlParsePubidLiteral(ctxt);
3222 if (*publicID == NULL) {
3223 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3224 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3225 NULL, NULL);
3226 }
3227 SKIP_BLANKS;
3228 if ((CUR == '"') || (CUR == '\'')) {
3229 URI = htmlParseSystemLiteral(ctxt);
3230 }
3231 }
3232 return(URI);
3233}
3234
3235/**
3236 * htmlParsePI:
3237 * @ctxt: an HTML parser context
3238 *
3239 * Parse an XML Processing Instruction. HTML5 doesn't allow processing
3240 * instructions, so this will be removed at some point.
3241 */
3242static void
3243htmlParsePI(htmlParserCtxtPtr ctxt) {
3244 xmlChar *buf = NULL;
3245 int len = 0;
3246 int size = HTML_PARSER_BUFFER_SIZE;
3247 int cur, l;
3248 int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
3249 XML_MAX_HUGE_LENGTH :
3250 XML_MAX_TEXT_LENGTH;
3251 const xmlChar *target;
3252 xmlParserInputState state;
3253
3254 if ((RAW == '<') && (NXT(1) == '?')) {
3255 state = ctxt->instate;
3256 ctxt->instate = XML_PARSER_PI;
3257 /*
3258 * this is a Processing Instruction.
3259 */
3260 SKIP(2);
3261
3262 /*
3263 * Parse the target name and check for special support like
3264 * namespace.
3265 */
3266 target = htmlParseName(ctxt);
3267 if (target != NULL) {
3268 if (RAW == '>') {
3269 SKIP(1);
3270
3271 /*
3272 * SAX: PI detected.
3273 */
3274 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3275 (ctxt->sax->processingInstruction != NULL))
3276 ctxt->sax->processingInstruction(ctxt->userData,
3277 target, NULL);
3278 goto done;
3279 }
3280 buf = (xmlChar *) xmlMallocAtomic(size);
3281 if (buf == NULL) {
3282 htmlErrMemory(ctxt);
3283 return;
3284 }
3285 cur = CUR;
3286 if (!IS_BLANK(cur)) {
3287 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3288 "ParsePI: PI %s space expected\n", target, NULL);
3289 }
3290 SKIP_BLANKS;
3291 cur = CUR_CHAR(l);
3292 while ((cur != 0) && (cur != '>')) {
3293 if (len + 5 >= size) {
3294 xmlChar *tmp;
3295
3296 size *= 2;
3297 tmp = (xmlChar *) xmlRealloc(buf, size);
3298 if (tmp == NULL) {
3299 htmlErrMemory(ctxt);
3300 xmlFree(buf);
3301 return;
3302 }
3303 buf = tmp;
3304 }
3305 if (IS_CHAR(cur)) {
3306 COPY_BUF(l,buf,len,cur);
3307 } else {
3308 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3309 "Invalid char in processing instruction "
3310 "0x%X\n", cur);
3311 }
3312 if (len > maxLength) {
3313 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3314 "PI %s too long", target, NULL);
3315 xmlFree(buf);
3316 goto done;
3317 }
3318 NEXTL(l);
3319 cur = CUR_CHAR(l);
3320 }
3321 buf[len] = 0;
3322 if (cur != '>') {
3323 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3324 "ParsePI: PI %s never end ...\n", target, NULL);
3325 } else {
3326 SKIP(1);
3327
3328 /*
3329 * SAX: PI detected.
3330 */
3331 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3332 (ctxt->sax->processingInstruction != NULL))
3333 ctxt->sax->processingInstruction(ctxt->userData,
3334 target, buf);
3335 }
3336 xmlFree(buf);
3337 } else {
3338 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3339 "PI is not started correctly", NULL, NULL);
3340 }
3341
3342done:
3343 ctxt->instate = state;
3344 }
3345}
3346
3347/**
3348 * htmlParseComment:
3349 * @ctxt: an HTML parser context
3350 *
3351 * Parse an HTML comment
3352 */
3353static void
3354htmlParseComment(htmlParserCtxtPtr ctxt) {
3355 xmlChar *buf = NULL;
3356 int len;
3357 int size = HTML_PARSER_BUFFER_SIZE;
3358 int q, ql;
3359 int r, rl;
3360 int cur, l;
3361 int next, nl;
3362 int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
3363 XML_MAX_HUGE_LENGTH :
3364 XML_MAX_TEXT_LENGTH;
3365 xmlParserInputState state;
3366
3367 /*
3368 * Check that there is a comment right here.
3369 */
3370 if ((RAW != '<') || (NXT(1) != '!') ||
3371 (NXT(2) != '-') || (NXT(3) != '-')) return;
3372
3373 state = ctxt->instate;
3374 ctxt->instate = XML_PARSER_COMMENT;
3375 SKIP(4);
3376 buf = (xmlChar *) xmlMallocAtomic(size);
3377 if (buf == NULL) {
3378 htmlErrMemory(ctxt);
3379 return;
3380 }
3381 len = 0;
3382 buf[len] = 0;
3383 q = CUR_CHAR(ql);
3384 if (q == 0)
3385 goto unfinished;
3386 if (q == '>') {
3387 htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3388 cur = '>';
3389 goto finished;
3390 }
3391 NEXTL(ql);
3392 r = CUR_CHAR(rl);
3393 if (r == 0)
3394 goto unfinished;
3395 if (q == '-' && r == '>') {
3396 htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3397 cur = '>';
3398 goto finished;
3399 }
3400 NEXTL(rl);
3401 cur = CUR_CHAR(l);
3402 while ((cur != 0) &&
3403 ((cur != '>') ||
3404 (r != '-') || (q != '-'))) {
3405 NEXTL(l);
3406 next = CUR_CHAR(nl);
3407
3408 if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
3409 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3410 "Comment incorrectly closed by '--!>'", NULL, NULL);
3411 cur = '>';
3412 break;
3413 }
3414
3415 if (len + 5 >= size) {
3416 xmlChar *tmp;
3417
3418 size *= 2;
3419 tmp = (xmlChar *) xmlRealloc(buf, size);
3420 if (tmp == NULL) {
3421 xmlFree(buf);
3422 htmlErrMemory(ctxt);
3423 return;
3424 }
3425 buf = tmp;
3426 }
3427 if (IS_CHAR(q)) {
3428 COPY_BUF(ql,buf,len,q);
3429 } else {
3430 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3431 "Invalid char in comment 0x%X\n", q);
3432 }
3433 if (len > maxLength) {
3434 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3435 "comment too long", NULL, NULL);
3436 xmlFree(buf);
3437 ctxt->instate = state;
3438 return;
3439 }
3440
3441 q = r;
3442 ql = rl;
3443 r = cur;
3444 rl = l;
3445 cur = next;
3446 l = nl;
3447 }
3448finished:
3449 buf[len] = 0;
3450 if (cur == '>') {
3451 NEXT;
3452 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3453 (!ctxt->disableSAX))
3454 ctxt->sax->comment(ctxt->userData, buf);
3455 xmlFree(buf);
3456 ctxt->instate = state;
3457 return;
3458 }
3459
3460unfinished:
3461 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3462 "Comment not terminated \n<!--%.50s\n", buf, NULL);
3463 xmlFree(buf);
3464}
3465
3466/**
3467 * htmlParseCharRef:
3468 * @ctxt: an HTML parser context
3469 *
3470 * DEPRECATED: Internal function, don't use.
3471 *
3472 * parse Reference declarations
3473 *
3474 * [66] CharRef ::= '&#' [0-9]+ ';' |
3475 * '&#x' [0-9a-fA-F]+ ';'
3476 *
3477 * Returns the value parsed (as an int)
3478 */
3479int
3480htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3481 int val = 0;
3482
3483 if ((ctxt == NULL) || (ctxt->input == NULL))
3484 return(0);
3485 if ((CUR == '&') && (NXT(1) == '#') &&
3486 ((NXT(2) == 'x') || NXT(2) == 'X')) {
3487 SKIP(3);
3488 while (CUR != ';') {
3489 if ((CUR >= '0') && (CUR <= '9')) {
3490 if (val < 0x110000)
3491 val = val * 16 + (CUR - '0');
3492 } else if ((CUR >= 'a') && (CUR <= 'f')) {
3493 if (val < 0x110000)
3494 val = val * 16 + (CUR - 'a') + 10;
3495 } else if ((CUR >= 'A') && (CUR <= 'F')) {
3496 if (val < 0x110000)
3497 val = val * 16 + (CUR - 'A') + 10;
3498 } else {
3499 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3500 "htmlParseCharRef: missing semicolon\n",
3501 NULL, NULL);
3502 break;
3503 }
3504 NEXT;
3505 }
3506 if (CUR == ';')
3507 NEXT;
3508 } else if ((CUR == '&') && (NXT(1) == '#')) {
3509 SKIP(2);
3510 while (CUR != ';') {
3511 if ((CUR >= '0') && (CUR <= '9')) {
3512 if (val < 0x110000)
3513 val = val * 10 + (CUR - '0');
3514 } else {
3515 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3516 "htmlParseCharRef: missing semicolon\n",
3517 NULL, NULL);
3518 break;
3519 }
3520 NEXT;
3521 }
3522 if (CUR == ';')
3523 NEXT;
3524 } else {
3525 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3526 "htmlParseCharRef: invalid value\n", NULL, NULL);
3527 }
3528 /*
3529 * Check the value IS_CHAR ...
3530 */
3531 if (IS_CHAR(val)) {
3532 return(val);
3533 } else if (val >= 0x110000) {
3534 htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3535 "htmlParseCharRef: value too large\n", NULL, NULL);
3536 } else {
3537 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3538 "htmlParseCharRef: invalid xmlChar value %d\n",
3539 val);
3540 }
3541 return(0);
3542}
3543
3544
3545/**
3546 * htmlParseDocTypeDecl:
3547 * @ctxt: an HTML parser context
3548 *
3549 * parse a DOCTYPE declaration
3550 *
3551 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3552 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3553 */
3554
3555static void
3556htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3557 const xmlChar *name;
3558 xmlChar *ExternalID = NULL;
3559 xmlChar *URI = NULL;
3560
3561 /*
3562 * We know that '<!DOCTYPE' has been detected.
3563 */
3564 SKIP(9);
3565
3566 SKIP_BLANKS;
3567
3568 /*
3569 * Parse the DOCTYPE name.
3570 */
3571 name = htmlParseName(ctxt);
3572 if (name == NULL) {
3573 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3574 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3575 NULL, NULL);
3576 }
3577 /*
3578 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3579 */
3580
3581 SKIP_BLANKS;
3582
3583 /*
3584 * Check for SystemID and ExternalID
3585 */
3586 URI = htmlParseExternalID(ctxt, &ExternalID);
3587 SKIP_BLANKS;
3588
3589 /*
3590 * We should be at the end of the DOCTYPE declaration.
3591 */
3592 if (CUR != '>') {
3593 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3594 "DOCTYPE improperly terminated\n", NULL, NULL);
3595 /* Ignore bogus content */
3596 while ((CUR != 0) && (CUR != '>') &&
3597 (PARSER_STOPPED(ctxt) == 0))
3598 NEXT;
3599 }
3600 if (CUR == '>')
3601 NEXT;
3602
3603 /*
3604 * Create or update the document accordingly to the DOCTYPE
3605 */
3606 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3607 (!ctxt->disableSAX))
3608 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3609
3610 /*
3611 * Cleanup, since we don't use all those identifiers
3612 */
3613 if (URI != NULL) xmlFree(URI);
3614 if (ExternalID != NULL) xmlFree(ExternalID);
3615}
3616
3617/**
3618 * htmlParseAttribute:
3619 * @ctxt: an HTML parser context
3620 * @value: a xmlChar ** used to store the value of the attribute
3621 *
3622 * parse an attribute
3623 *
3624 * [41] Attribute ::= Name Eq AttValue
3625 *
3626 * [25] Eq ::= S? '=' S?
3627 *
3628 * With namespace:
3629 *
3630 * [NS 11] Attribute ::= QName Eq AttValue
3631 *
3632 * Also the case QName == xmlns:??? is handled independently as a namespace
3633 * definition.
3634 *
3635 * Returns the attribute name, and the value in *value.
3636 */
3637
3638static const xmlChar *
3639htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3640 const xmlChar *name;
3641 xmlChar *val = NULL;
3642
3643 *value = NULL;
3644 name = htmlParseHTMLName(ctxt);
3645 if (name == NULL) {
3646 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3647 "error parsing attribute name\n", NULL, NULL);
3648 return(NULL);
3649 }
3650
3651 /*
3652 * read the value
3653 */
3654 SKIP_BLANKS;
3655 if (CUR == '=') {
3656 NEXT;
3657 SKIP_BLANKS;
3658 val = htmlParseAttValue(ctxt);
3659 }
3660
3661 *value = val;
3662 return(name);
3663}
3664
3665/**
3666 * htmlCheckEncoding:
3667 * @ctxt: an HTML parser context
3668 * @attvalue: the attribute value
3669 *
3670 * Checks an http-equiv attribute from a Meta tag to detect
3671 * the encoding
3672 * If a new encoding is detected the parser is switched to decode
3673 * it and pass UTF8
3674 */
3675static void
3676htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3677 const xmlChar *encoding;
3678 xmlChar *copy;
3679
3680 if (!attvalue)
3681 return;
3682
3683 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3684 if (encoding != NULL) {
3685 encoding += 7;
3686 }
3687 /*
3688 * skip blank
3689 */
3690 if (encoding && IS_BLANK_CH(*encoding))
3691 encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3692 if (encoding && *encoding == '=') {
3693 encoding ++;
3694 copy = xmlStrdup(encoding);
3695 if (copy == NULL)
3696 htmlErrMemory(ctxt);
3697 xmlSetDeclaredEncoding(ctxt, copy);
3698 }
3699}
3700
3701/**
3702 * htmlCheckMeta:
3703 * @ctxt: an HTML parser context
3704 * @atts: the attributes values
3705 *
3706 * Checks an attributes from a Meta tag
3707 */
3708static void
3709htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3710 int i;
3711 const xmlChar *att, *value;
3712 int http = 0;
3713 const xmlChar *content = NULL;
3714
3715 if ((ctxt == NULL) || (atts == NULL))
3716 return;
3717
3718 i = 0;
3719 att = atts[i++];
3720 while (att != NULL) {
3721 value = atts[i++];
3722 if (value != NULL) {
3723 if ((!xmlStrcasecmp(att, BAD_CAST "http-equiv")) &&
3724 (!xmlStrcasecmp(value, BAD_CAST "Content-Type"))) {
3725 http = 1;
3726 } else if (!xmlStrcasecmp(att, BAD_CAST "charset")) {
3727 xmlChar *copy;
3728
3729 copy = xmlStrdup(value);
3730 if (copy == NULL)
3731 htmlErrMemory(ctxt);
3732 xmlSetDeclaredEncoding(ctxt, copy);
3733 } else if (!xmlStrcasecmp(att, BAD_CAST "content")) {
3734 content = value;
3735 }
3736 }
3737 att = atts[i++];
3738 }
3739 if ((http) && (content != NULL))
3740 htmlCheckEncoding(ctxt, content);
3741
3742}
3743
3744/**
3745 * htmlParseStartTag:
3746 * @ctxt: an HTML parser context
3747 *
3748 * parse a start of tag either for rule element or
3749 * EmptyElement. In both case we don't parse the tag closing chars.
3750 *
3751 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3752 *
3753 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3754 *
3755 * With namespace:
3756 *
3757 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3758 *
3759 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3760 *
3761 * Returns 0 in case of success, -1 in case of error and 1 if discarded
3762 */
3763
3764static int
3765htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3766 const xmlChar *name;
3767 const xmlChar *attname;
3768 xmlChar *attvalue;
3769 const xmlChar **atts;
3770 int nbatts = 0;
3771 int maxatts;
3772 int meta = 0;
3773 int i;
3774 int discardtag = 0;
3775
3776 if ((ctxt == NULL) || (ctxt->input == NULL))
3777 return -1;
3778 if (CUR != '<') return -1;
3779 NEXT;
3780
3781 atts = ctxt->atts;
3782 maxatts = ctxt->maxatts;
3783
3784 GROW;
3785 name = htmlParseHTMLName(ctxt);
3786 if (name == NULL) {
3787 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3788 "htmlParseStartTag: invalid element name\n",
3789 NULL, NULL);
3790 /* Dump the bogus tag like browsers do */
3791 while ((CUR != 0) && (CUR != '>') &&
3792 (PARSER_STOPPED(ctxt) == 0))
3793 NEXT;
3794 return -1;
3795 }
3796 if (xmlStrEqual(name, BAD_CAST"meta"))
3797 meta = 1;
3798
3799 /*
3800 * Check for auto-closure of HTML elements.
3801 */
3802 htmlAutoClose(ctxt, name);
3803
3804 /*
3805 * Check for implied HTML elements.
3806 */
3807 htmlCheckImplied(ctxt, name);
3808
3809 /*
3810 * Avoid html at any level > 0, head at any level != 1
3811 * or any attempt to recurse body
3812 */
3813 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3814 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3815 "htmlParseStartTag: misplaced <html> tag\n",
3816 name, NULL);
3817 discardtag = 1;
3818 ctxt->depth++;
3819 }
3820 if ((ctxt->nameNr != 1) &&
3821 (xmlStrEqual(name, BAD_CAST"head"))) {
3822 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3823 "htmlParseStartTag: misplaced <head> tag\n",
3824 name, NULL);
3825 discardtag = 1;
3826 ctxt->depth++;
3827 }
3828 if (xmlStrEqual(name, BAD_CAST"body")) {
3829 int indx;
3830 for (indx = 0;indx < ctxt->nameNr;indx++) {
3831 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3832 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3833 "htmlParseStartTag: misplaced <body> tag\n",
3834 name, NULL);
3835 discardtag = 1;
3836 ctxt->depth++;
3837 }
3838 }
3839 }
3840
3841 /*
3842 * Now parse the attributes, it ends up with the ending
3843 *
3844 * (S Attribute)* S?
3845 */
3846 SKIP_BLANKS;
3847 while ((CUR != 0) &&
3848 (CUR != '>') &&
3849 ((CUR != '/') || (NXT(1) != '>')) &&
3850 (PARSER_STOPPED(ctxt) == 0)) {
3851 GROW;
3852 attname = htmlParseAttribute(ctxt, &attvalue);
3853 if (attname != NULL) {
3854
3855 /*
3856 * Well formedness requires at most one declaration of an attribute
3857 */
3858 for (i = 0; i < nbatts;i += 2) {
3859 if (xmlStrEqual(atts[i], attname)) {
3860 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3861 "Attribute %s redefined\n", attname, NULL);
3862 if (attvalue != NULL)
3863 xmlFree(attvalue);
3864 goto failed;
3865 }
3866 }
3867
3868 /*
3869 * Add the pair to atts
3870 */
3871 if (atts == NULL) {
3872 maxatts = 22; /* allow for 10 attrs by default */
3873 atts = (const xmlChar **)
3874 xmlMalloc(maxatts * sizeof(xmlChar *));
3875 if (atts == NULL) {
3876 htmlErrMemory(ctxt);
3877 if (attvalue != NULL)
3878 xmlFree(attvalue);
3879 goto failed;
3880 }
3881 ctxt->atts = atts;
3882 ctxt->maxatts = maxatts;
3883 } else if (nbatts + 4 > maxatts) {
3884 const xmlChar **n;
3885
3886 maxatts *= 2;
3887 n = (const xmlChar **) xmlRealloc((void *) atts,
3888 maxatts * sizeof(const xmlChar *));
3889 if (n == NULL) {
3890 htmlErrMemory(ctxt);
3891 if (attvalue != NULL)
3892 xmlFree(attvalue);
3893 goto failed;
3894 }
3895 atts = n;
3896 ctxt->atts = atts;
3897 ctxt->maxatts = maxatts;
3898 }
3899 atts[nbatts++] = attname;
3900 atts[nbatts++] = attvalue;
3901 atts[nbatts] = NULL;
3902 atts[nbatts + 1] = NULL;
3903 }
3904 else {
3905 if (attvalue != NULL)
3906 xmlFree(attvalue);
3907 /* Dump the bogus attribute string up to the next blank or
3908 * the end of the tag. */
3909 while ((CUR != 0) &&
3910 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3911 ((CUR != '/') || (NXT(1) != '>')) &&
3912 (PARSER_STOPPED(ctxt) == 0))
3913 NEXT;
3914 }
3915
3916failed:
3917 SKIP_BLANKS;
3918 }
3919
3920 /*
3921 * Handle specific association to the META tag
3922 */
3923 if (meta && (nbatts != 0))
3924 htmlCheckMeta(ctxt, atts);
3925
3926 /*
3927 * SAX: Start of Element !
3928 */
3929 if (!discardtag) {
3930 htmlnamePush(ctxt, name);
3931 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3932 if (nbatts != 0)
3933 ctxt->sax->startElement(ctxt->userData, name, atts);
3934 else
3935 ctxt->sax->startElement(ctxt->userData, name, NULL);
3936 }
3937 }
3938
3939 if (atts != NULL) {
3940 for (i = 1;i < nbatts;i += 2) {
3941 if (atts[i] != NULL)
3942 xmlFree((xmlChar *) atts[i]);
3943 }
3944 }
3945
3946 return(discardtag);
3947}
3948
3949/**
3950 * htmlParseEndTag:
3951 * @ctxt: an HTML parser context
3952 *
3953 * parse an end of tag
3954 *
3955 * [42] ETag ::= '</' Name S? '>'
3956 *
3957 * With namespace
3958 *
3959 * [NS 9] ETag ::= '</' QName S? '>'
3960 *
3961 * Returns 1 if the current level should be closed.
3962 */
3963
3964static int
3965htmlParseEndTag(htmlParserCtxtPtr ctxt)
3966{
3967 const xmlChar *name;
3968 const xmlChar *oldname;
3969 int i, ret;
3970
3971 if ((CUR != '<') || (NXT(1) != '/')) {
3972 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3973 "htmlParseEndTag: '</' not found\n", NULL, NULL);
3974 return (0);
3975 }
3976 SKIP(2);
3977
3978 name = htmlParseHTMLName(ctxt);
3979 if (name == NULL)
3980 return (0);
3981 /*
3982 * We should definitely be at the ending "S? '>'" part
3983 */
3984 SKIP_BLANKS;
3985 if (CUR != '>') {
3986 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3987 "End tag : expected '>'\n", NULL, NULL);
3988 /* Skip to next '>' */
3989 while ((PARSER_STOPPED(ctxt) == 0) &&
3990 (CUR != 0) && (CUR != '>'))
3991 NEXT;
3992 }
3993 if (CUR == '>')
3994 NEXT;
3995
3996 /*
3997 * if we ignored misplaced tags in htmlParseStartTag don't pop them
3998 * out now.
3999 */
4000 if ((ctxt->depth > 0) &&
4001 (xmlStrEqual(name, BAD_CAST "html") ||
4002 xmlStrEqual(name, BAD_CAST "body") ||
4003 xmlStrEqual(name, BAD_CAST "head"))) {
4004 ctxt->depth--;
4005 return (0);
4006 }
4007
4008 /*
4009 * If the name read is not one of the element in the parsing stack
4010 * then return, it's just an error.
4011 */
4012 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4013 if (xmlStrEqual(name, ctxt->nameTab[i]))
4014 break;
4015 }
4016 if (i < 0) {
4017 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4018 "Unexpected end tag : %s\n", name, NULL);
4019 return (0);
4020 }
4021
4022
4023 /*
4024 * Check for auto-closure of HTML elements.
4025 */
4026
4027 htmlAutoCloseOnClose(ctxt, name);
4028
4029 /*
4030 * Well formedness constraints, opening and closing must match.
4031 * With the exception that the autoclose may have popped stuff out
4032 * of the stack.
4033 */
4034 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4035 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4036 "Opening and ending tag mismatch: %s and %s\n",
4037 name, ctxt->name);
4038 }
4039
4040 /*
4041 * SAX: End of Tag
4042 */
4043 oldname = ctxt->name;
4044 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4045 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4046 ctxt->sax->endElement(ctxt->userData, name);
4047 htmlNodeInfoPop(ctxt);
4048 htmlnamePop(ctxt);
4049 ret = 1;
4050 } else {
4051 ret = 0;
4052 }
4053
4054 return (ret);
4055}
4056
4057
4058/**
4059 * htmlParseReference:
4060 * @ctxt: an HTML parser context
4061 *
4062 * parse and handle entity references in content,
4063 * this will end-up in a call to character() since this is either a
4064 * CharRef, or a predefined entity.
4065 */
4066static void
4067htmlParseReference(htmlParserCtxtPtr ctxt) {
4068 const htmlEntityDesc * ent;
4069 xmlChar out[6];
4070 const xmlChar *name;
4071 if (CUR != '&') return;
4072
4073 if (NXT(1) == '#') {
4074 unsigned int c;
4075 int bits, i = 0;
4076
4077 c = htmlParseCharRef(ctxt);
4078 if (c == 0)
4079 return;
4080
4081 if (c < 0x80) { out[i++]= c; bits= -6; }
4082 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4083 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4084 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4085
4086 for ( ; bits >= 0; bits-= 6) {
4087 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4088 }
4089 out[i] = 0;
4090
4091 htmlCheckParagraph(ctxt);
4092 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4093 ctxt->sax->characters(ctxt->userData, out, i);
4094 } else {
4095 ent = htmlParseEntityRef(ctxt, &name);
4096 if (name == NULL) {
4097 htmlCheckParagraph(ctxt);
4098 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4099 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4100 return;
4101 }
4102 if ((ent == NULL) || !(ent->value > 0)) {
4103 htmlCheckParagraph(ctxt);
4104 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4105 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4106 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4107 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4108 }
4109 } else {
4110 unsigned int c;
4111 int bits, i = 0;
4112
4113 c = ent->value;
4114 if (c < 0x80)
4115 { out[i++]= c; bits= -6; }
4116 else if (c < 0x800)
4117 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4118 else if (c < 0x10000)
4119 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4120 else
4121 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4122
4123 for ( ; bits >= 0; bits-= 6) {
4124 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4125 }
4126 out[i] = 0;
4127
4128 htmlCheckParagraph(ctxt);
4129 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4130 ctxt->sax->characters(ctxt->userData, out, i);
4131 }
4132 }
4133}
4134
4135/**
4136 * htmlParseContent:
4137 * @ctxt: an HTML parser context
4138 *
4139 * Parse a content: comment, sub-element, reference or text.
4140 * Kept for compatibility with old code
4141 */
4142
4143static void
4144htmlParseContent(htmlParserCtxtPtr ctxt) {
4145 xmlChar *currentNode;
4146 int depth;
4147 const xmlChar *name;
4148
4149 currentNode = xmlStrdup(ctxt->name);
4150 depth = ctxt->nameNr;
4151 while (!PARSER_STOPPED(ctxt)) {
4152 GROW;
4153
4154 /*
4155 * Our tag or one of it's parent or children is ending.
4156 */
4157 if ((CUR == '<') && (NXT(1) == '/')) {
4158 if (htmlParseEndTag(ctxt) &&
4159 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4160 if (currentNode != NULL)
4161 xmlFree(currentNode);
4162 return;
4163 }
4164 continue; /* while */
4165 }
4166
4167 else if ((CUR == '<') &&
4168 ((IS_ASCII_LETTER(NXT(1))) ||
4169 (NXT(1) == '_') || (NXT(1) == ':'))) {
4170 name = htmlParseHTMLName_nonInvasive(ctxt);
4171 if (name == NULL) {
4172 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4173 "htmlParseStartTag: invalid element name\n",
4174 NULL, NULL);
4175 /* Dump the bogus tag like browsers do */
4176 while ((CUR != 0) && (CUR != '>'))
4177 NEXT;
4178
4179 if (currentNode != NULL)
4180 xmlFree(currentNode);
4181 return;
4182 }
4183
4184 if (ctxt->name != NULL) {
4185 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4186 htmlAutoClose(ctxt, name);
4187 continue;
4188 }
4189 }
4190 }
4191
4192 /*
4193 * Has this node been popped out during parsing of
4194 * the next element
4195 */
4196 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4197 (!xmlStrEqual(currentNode, ctxt->name)))
4198 {
4199 if (currentNode != NULL) xmlFree(currentNode);
4200 return;
4201 }
4202
4203 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4204 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4205 /*
4206 * Handle SCRIPT/STYLE separately
4207 */
4208 htmlParseScript(ctxt);
4209 }
4210
4211 else if ((CUR == '<') && (NXT(1) == '!')) {
4212 /*
4213 * Sometimes DOCTYPE arrives in the middle of the document
4214 */
4215 if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4216 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4217 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4218 (UPP(8) == 'E')) {
4219 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4220 "Misplaced DOCTYPE declaration\n",
4221 BAD_CAST "DOCTYPE" , NULL);
4222 htmlParseDocTypeDecl(ctxt);
4223 }
4224 /*
4225 * First case : a comment
4226 */
4227 else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4228 htmlParseComment(ctxt);
4229 }
4230 else {
4231 htmlSkipBogusComment(ctxt);
4232 }
4233 }
4234
4235 /*
4236 * Second case : a Processing Instruction.
4237 */
4238 else if ((CUR == '<') && (NXT(1) == '?')) {
4239 htmlParsePI(ctxt);
4240 }
4241
4242 /*
4243 * Third case : a sub-element.
4244 */
4245 else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4246 htmlParseElement(ctxt);
4247 }
4248 else if (CUR == '<') {
4249 if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4250 (ctxt->sax->characters != NULL))
4251 ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4252 NEXT;
4253 }
4254
4255 /*
4256 * Fourth case : a reference. If if has not been resolved,
4257 * parsing returns it's Name, create the node
4258 */
4259 else if (CUR == '&') {
4260 htmlParseReference(ctxt);
4261 }
4262
4263 /*
4264 * Fifth case : end of the resource
4265 */
4266 else if (CUR == 0) {
4267 htmlAutoCloseOnEnd(ctxt);
4268 break;
4269 }
4270
4271 /*
4272 * Last case, text. Note that References are handled directly.
4273 */
4274 else {
4275 htmlParseCharData(ctxt);
4276 }
4277
4278 SHRINK;
4279 GROW;
4280 }
4281 if (currentNode != NULL) xmlFree(currentNode);
4282}
4283
4284/**
4285 * htmlParseElement:
4286 * @ctxt: an HTML parser context
4287 *
4288 * DEPRECATED: Internal function, don't use.
4289 *
4290 * parse an HTML element, this is highly recursive
4291 * this is kept for compatibility with previous code versions
4292 *
4293 * [39] element ::= EmptyElemTag | STag content ETag
4294 *
4295 * [41] Attribute ::= Name Eq AttValue
4296 */
4297
4298void
4299htmlParseElement(htmlParserCtxtPtr ctxt) {
4300 const xmlChar *name;
4301 xmlChar *currentNode = NULL;
4302 const htmlElemDesc * info;
4303 htmlParserNodeInfo node_info;
4304 int failed;
4305 int depth;
4306 const xmlChar *oldptr;
4307
4308 if ((ctxt == NULL) || (ctxt->input == NULL))
4309 return;
4310
4311 /* Capture start position */
4312 if (ctxt->record_info) {
4313 node_info.begin_pos = ctxt->input->consumed +
4314 (CUR_PTR - ctxt->input->base);
4315 node_info.begin_line = ctxt->input->line;
4316 }
4317
4318 failed = htmlParseStartTag(ctxt);
4319 name = ctxt->name;
4320 if ((failed == -1) || (name == NULL)) {
4321 if (CUR == '>')
4322 NEXT;
4323 return;
4324 }
4325
4326 /*
4327 * Lookup the info for that element.
4328 */
4329 info = htmlTagLookup(name);
4330 if (info == NULL) {
4331 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4332 "Tag %s invalid\n", name, NULL);
4333 }
4334
4335 /*
4336 * Check for an Empty Element labeled the XML/SGML way
4337 */
4338 if ((CUR == '/') && (NXT(1) == '>')) {
4339 SKIP(2);
4340 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4341 ctxt->sax->endElement(ctxt->userData, name);
4342 htmlnamePop(ctxt);
4343 return;
4344 }
4345
4346 if (CUR == '>') {
4347 NEXT;
4348 } else {
4349 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4350 "Couldn't find end of Start Tag %s\n", name, NULL);
4351
4352 /*
4353 * end of parsing of this node.
4354 */
4355 if (xmlStrEqual(name, ctxt->name)) {
4356 nodePop(ctxt);
4357 htmlnamePop(ctxt);
4358 }
4359
4360 /*
4361 * Capture end position and add node
4362 */
4363 if (ctxt->record_info) {
4364 node_info.end_pos = ctxt->input->consumed +
4365 (CUR_PTR - ctxt->input->base);
4366 node_info.end_line = ctxt->input->line;
4367 node_info.node = ctxt->node;
4368 xmlParserAddNodeInfo(ctxt, &node_info);
4369 }
4370 return;
4371 }
4372
4373 /*
4374 * Check for an Empty Element from DTD definition
4375 */
4376 if ((info != NULL) && (info->empty)) {
4377 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4378 ctxt->sax->endElement(ctxt->userData, name);
4379 htmlnamePop(ctxt);
4380 return;
4381 }
4382
4383 /*
4384 * Parse the content of the element:
4385 */
4386 currentNode = xmlStrdup(ctxt->name);
4387 depth = ctxt->nameNr;
4388 while (CUR != 0) {
4389 oldptr = ctxt->input->cur;
4390 htmlParseContent(ctxt);
4391 if (oldptr==ctxt->input->cur) break;
4392 if (ctxt->nameNr < depth) break;
4393 }
4394
4395 /*
4396 * Capture end position and add node
4397 */
4398 if ( currentNode != NULL && ctxt->record_info ) {
4399 node_info.end_pos = ctxt->input->consumed +
4400 (CUR_PTR - ctxt->input->base);
4401 node_info.end_line = ctxt->input->line;
4402 node_info.node = ctxt->node;
4403 xmlParserAddNodeInfo(ctxt, &node_info);
4404 }
4405 if (CUR == 0) {
4406 htmlAutoCloseOnEnd(ctxt);
4407 }
4408
4409 if (currentNode != NULL)
4410 xmlFree(currentNode);
4411}
4412
4413static void
4414htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4415 /*
4416 * Capture end position and add node
4417 */
4418 if ( ctxt->node != NULL && ctxt->record_info ) {
4419 ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4420 (CUR_PTR - ctxt->input->base);
4421 ctxt->nodeInfo->end_line = ctxt->input->line;
4422 ctxt->nodeInfo->node = ctxt->node;
4423 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4424 htmlNodeInfoPop(ctxt);
4425 }
4426 if (CUR == 0) {
4427 htmlAutoCloseOnEnd(ctxt);
4428 }
4429}
4430
4431/**
4432 * htmlParseElementInternal:
4433 * @ctxt: an HTML parser context
4434 *
4435 * parse an HTML element, new version, non recursive
4436 *
4437 * [39] element ::= EmptyElemTag | STag content ETag
4438 *
4439 * [41] Attribute ::= Name Eq AttValue
4440 */
4441
4442static void
4443htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4444 const xmlChar *name;
4445 const htmlElemDesc * info;
4446 htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4447 int failed;
4448
4449 if ((ctxt == NULL) || (ctxt->input == NULL))
4450 return;
4451
4452 /* Capture start position */
4453 if (ctxt->record_info) {
4454 node_info.begin_pos = ctxt->input->consumed +
4455 (CUR_PTR - ctxt->input->base);
4456 node_info.begin_line = ctxt->input->line;
4457 }
4458
4459 failed = htmlParseStartTag(ctxt);
4460 name = ctxt->name;
4461 if ((failed == -1) || (name == NULL)) {
4462 if (CUR == '>')
4463 NEXT;
4464 return;
4465 }
4466
4467 /*
4468 * Lookup the info for that element.
4469 */
4470 info = htmlTagLookup(name);
4471 if (info == NULL) {
4472 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4473 "Tag %s invalid\n", name, NULL);
4474 }
4475
4476 /*
4477 * Check for an Empty Element labeled the XML/SGML way
4478 */
4479 if ((CUR == '/') && (NXT(1) == '>')) {
4480 SKIP(2);
4481 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4482 ctxt->sax->endElement(ctxt->userData, name);
4483 htmlnamePop(ctxt);
4484 return;
4485 }
4486
4487 if (CUR == '>') {
4488 NEXT;
4489 } else {
4490 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4491 "Couldn't find end of Start Tag %s\n", name, NULL);
4492
4493 /*
4494 * end of parsing of this node.
4495 */
4496 if (xmlStrEqual(name, ctxt->name)) {
4497 nodePop(ctxt);
4498 htmlnamePop(ctxt);
4499 }
4500
4501 if (ctxt->record_info)
4502 htmlNodeInfoPush(ctxt, &node_info);
4503 htmlParserFinishElementParsing(ctxt);
4504 return;
4505 }
4506
4507 /*
4508 * Check for an Empty Element from DTD definition
4509 */
4510 if ((info != NULL) && (info->empty)) {
4511 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4512 ctxt->sax->endElement(ctxt->userData, name);
4513 htmlnamePop(ctxt);
4514 return;
4515 }
4516
4517 if (ctxt->record_info)
4518 htmlNodeInfoPush(ctxt, &node_info);
4519}
4520
4521/**
4522 * htmlParseContentInternal:
4523 * @ctxt: an HTML parser context
4524 *
4525 * Parse a content: comment, sub-element, reference or text.
4526 * New version for non recursive htmlParseElementInternal
4527 */
4528
4529static void
4530htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4531 xmlChar *currentNode;
4532 int depth;
4533 const xmlChar *name;
4534
4535 depth = ctxt->nameNr;
4536 if (depth <= 0) {
4537 currentNode = NULL;
4538 } else {
4539 currentNode = xmlStrdup(ctxt->name);
4540 if (currentNode == NULL) {
4541 htmlErrMemory(ctxt);
4542 return;
4543 }
4544 }
4545 while (PARSER_STOPPED(ctxt) == 0) {
4546 GROW;
4547
4548 /*
4549 * Our tag or one of it's parent or children is ending.
4550 */
4551 if ((CUR == '<') && (NXT(1) == '/')) {
4552 if (htmlParseEndTag(ctxt) &&
4553 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4554 if (currentNode != NULL)
4555 xmlFree(currentNode);
4556
4557 depth = ctxt->nameNr;
4558 if (depth <= 0) {
4559 currentNode = NULL;
4560 } else {
4561 currentNode = xmlStrdup(ctxt->name);
4562 if (currentNode == NULL) {
4563 htmlErrMemory(ctxt);
4564 break;
4565 }
4566 }
4567 }
4568 continue; /* while */
4569 }
4570
4571 else if ((CUR == '<') &&
4572 ((IS_ASCII_LETTER(NXT(1))) ||
4573 (NXT(1) == '_') || (NXT(1) == ':'))) {
4574 name = htmlParseHTMLName_nonInvasive(ctxt);
4575 if (name == NULL) {
4576 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4577 "htmlParseStartTag: invalid element name\n",
4578 NULL, NULL);
4579 /* Dump the bogus tag like browsers do */
4580 while ((CUR == 0) && (CUR != '>'))
4581 NEXT;
4582
4583 htmlParserFinishElementParsing(ctxt);
4584 if (currentNode != NULL)
4585 xmlFree(currentNode);
4586
4587 if (ctxt->name == NULL) {
4588 currentNode = NULL;
4589 } else {
4590 currentNode = xmlStrdup(ctxt->name);
4591 if (currentNode == NULL) {
4592 htmlErrMemory(ctxt);
4593 break;
4594 }
4595 }
4596 depth = ctxt->nameNr;
4597 continue;
4598 }
4599
4600 if (ctxt->name != NULL) {
4601 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4602 htmlAutoClose(ctxt, name);
4603 continue;
4604 }
4605 }
4606 }
4607
4608 /*
4609 * Has this node been popped out during parsing of
4610 * the next element
4611 */
4612 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4613 (!xmlStrEqual(currentNode, ctxt->name)))
4614 {
4615 htmlParserFinishElementParsing(ctxt);
4616 if (currentNode != NULL) xmlFree(currentNode);
4617
4618 if (ctxt->name == NULL) {
4619 currentNode = NULL;
4620 } else {
4621 currentNode = xmlStrdup(ctxt->name);
4622 if (currentNode == NULL) {
4623 htmlErrMemory(ctxt);
4624 break;
4625 }
4626 }
4627 depth = ctxt->nameNr;
4628 continue;
4629 }
4630
4631 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4632 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4633 /*
4634 * Handle SCRIPT/STYLE separately
4635 */
4636 htmlParseScript(ctxt);
4637 }
4638
4639 else if ((CUR == '<') && (NXT(1) == '!')) {
4640 /*
4641 * Sometimes DOCTYPE arrives in the middle of the document
4642 */
4643 if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4644 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4645 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4646 (UPP(8) == 'E')) {
4647 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4648 "Misplaced DOCTYPE declaration\n",
4649 BAD_CAST "DOCTYPE" , NULL);
4650 htmlParseDocTypeDecl(ctxt);
4651 }
4652 /*
4653 * First case : a comment
4654 */
4655 else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4656 htmlParseComment(ctxt);
4657 }
4658 else {
4659 htmlSkipBogusComment(ctxt);
4660 }
4661 }
4662
4663 /*
4664 * Second case : a Processing Instruction.
4665 */
4666 else if ((CUR == '<') && (NXT(1) == '?')) {
4667 htmlParsePI(ctxt);
4668 }
4669
4670 /*
4671 * Third case : a sub-element.
4672 */
4673 else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4674 htmlParseElementInternal(ctxt);
4675 if (currentNode != NULL) xmlFree(currentNode);
4676
4677 if (ctxt->name == NULL) {
4678 currentNode = NULL;
4679 } else {
4680 currentNode = xmlStrdup(ctxt->name);
4681 if (currentNode == NULL) {
4682 htmlErrMemory(ctxt);
4683 break;
4684 }
4685 }
4686 depth = ctxt->nameNr;
4687 }
4688 else if (CUR == '<') {
4689 if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4690 (ctxt->sax->characters != NULL))
4691 ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4692 NEXT;
4693 }
4694
4695 /*
4696 * Fourth case : a reference. If if has not been resolved,
4697 * parsing returns it's Name, create the node
4698 */
4699 else if (CUR == '&') {
4700 htmlParseReference(ctxt);
4701 }
4702
4703 /*
4704 * Fifth case : end of the resource
4705 */
4706 else if (CUR == 0) {
4707 htmlAutoCloseOnEnd(ctxt);
4708 break;
4709 }
4710
4711 /*
4712 * Last case, text. Note that References are handled directly.
4713 */
4714 else {
4715 htmlParseCharData(ctxt);
4716 }
4717
4718 SHRINK;
4719 GROW;
4720 }
4721 if (currentNode != NULL) xmlFree(currentNode);
4722}
4723
4724/**
4725 * htmlParseContent:
4726 * @ctxt: an HTML parser context
4727 *
4728 * Parse a content: comment, sub-element, reference or text.
4729 * This is the entry point when called from parser.c
4730 */
4731
4732void
4733__htmlParseContent(void *ctxt) {
4734 if (ctxt != NULL)
4735 htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4736}
4737
4738/**
4739 * htmlParseDocument:
4740 * @ctxt: an HTML parser context
4741 *
4742 * Parse an HTML document and invoke the SAX handlers. This is useful
4743 * if you're only interested in custom SAX callbacks. If you want a
4744 * document tree, use htmlCtxtParseDocument.
4745 *
4746 * Returns 0, -1 in case of error.
4747 */
4748
4749int
4750htmlParseDocument(htmlParserCtxtPtr ctxt) {
4751 xmlDtdPtr dtd;
4752
4753 if ((ctxt == NULL) || (ctxt->input == NULL))
4754 return(-1);
4755
4756 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
4757 ctxt->sax->setDocumentLocator(ctxt->userData,
4758 (xmlSAXLocator *) &xmlDefaultSAXLocator);
4759 }
4760
4761 xmlDetectEncoding(ctxt);
4762
4763 /*
4764 * This is wrong but matches long-standing behavior. In most cases,
4765 * a document starting with an XML declaration will specify UTF-8.
4766 */
4767 if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
4768 (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0))
4769 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
4770
4771 /*
4772 * Wipe out everything which is before the first '<'
4773 */
4774 SKIP_BLANKS;
4775 if (CUR == 0) {
4776 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4777 "Document is empty\n", NULL, NULL);
4778 }
4779
4780 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4781 ctxt->sax->startDocument(ctxt->userData);
4782
4783 /*
4784 * Parse possible comments and PIs before any content
4785 */
4786 while (((CUR == '<') && (NXT(1) == '!') &&
4787 (NXT(2) == '-') && (NXT(3) == '-')) ||
4788 ((CUR == '<') && (NXT(1) == '?'))) {
4789 htmlParseComment(ctxt);
4790 htmlParsePI(ctxt);
4791 SKIP_BLANKS;
4792 }
4793
4794
4795 /*
4796 * Then possibly doc type declaration(s) and more Misc
4797 * (doctypedecl Misc*)?
4798 */
4799 if ((CUR == '<') && (NXT(1) == '!') &&
4800 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4801 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4802 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4803 (UPP(8) == 'E')) {
4804 htmlParseDocTypeDecl(ctxt);
4805 }
4806 SKIP_BLANKS;
4807
4808 /*
4809 * Parse possible comments and PIs before any content
4810 */
4811 while ((PARSER_STOPPED(ctxt) == 0) &&
4812 (((CUR == '<') && (NXT(1) == '!') &&
4813 (NXT(2) == '-') && (NXT(3) == '-')) ||
4814 ((CUR == '<') && (NXT(1) == '?')))) {
4815 htmlParseComment(ctxt);
4816 htmlParsePI(ctxt);
4817 SKIP_BLANKS;
4818 }
4819
4820 /*
4821 * Time to start parsing the tree itself
4822 */
4823 htmlParseContentInternal(ctxt);
4824
4825 /*
4826 * autoclose
4827 */
4828 if (CUR == 0)
4829 htmlAutoCloseOnEnd(ctxt);
4830
4831
4832 /*
4833 * SAX: end of the document processing.
4834 */
4835 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4836 ctxt->sax->endDocument(ctxt->userData);
4837
4838 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4839 dtd = xmlGetIntSubset(ctxt->myDoc);
4840 if (dtd == NULL) {
4841 ctxt->myDoc->intSubset =
4842 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4843 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4844 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4845 if (ctxt->myDoc->intSubset == NULL)
4846 htmlErrMemory(ctxt);
4847 }
4848 }
4849 if (! ctxt->wellFormed) return(-1);
4850 return(0);
4851}
4852
4853
4854/************************************************************************
4855 * *
4856 * Parser contexts handling *
4857 * *
4858 ************************************************************************/
4859
4860/**
4861 * htmlInitParserCtxt:
4862 * @ctxt: an HTML parser context
4863 * @sax: SAX handler
4864 * @userData: user data
4865 *
4866 * Initialize a parser context
4867 *
4868 * Returns 0 in case of success and -1 in case of error
4869 */
4870
4871static int
4872htmlInitParserCtxt(htmlParserCtxtPtr ctxt, const htmlSAXHandler *sax,
4873 void *userData)
4874{
4875 if (ctxt == NULL) return(-1);
4876 memset(ctxt, 0, sizeof(htmlParserCtxt));
4877
4878 ctxt->dict = xmlDictCreate();
4879 if (ctxt->dict == NULL)
4880 return(-1);
4881
4882 if (ctxt->sax == NULL)
4883 ctxt->sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4884 if (ctxt->sax == NULL)
4885 return(-1);
4886 if (sax == NULL) {
4887 memset(ctxt->sax, 0, sizeof(htmlSAXHandler));
4888 xmlSAX2InitHtmlDefaultSAXHandler(ctxt->sax);
4889 ctxt->userData = ctxt;
4890 } else {
4891 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4892 ctxt->userData = userData ? userData : ctxt;
4893 }
4894
4895 /* Allocate the Input stack */
4896 ctxt->inputTab = (htmlParserInputPtr *)
4897 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4898 if (ctxt->inputTab == NULL)
4899 return(-1);
4900 ctxt->inputNr = 0;
4901 ctxt->inputMax = 5;
4902 ctxt->input = NULL;
4903 ctxt->version = NULL;
4904 ctxt->encoding = NULL;
4905 ctxt->standalone = -1;
4906 ctxt->instate = XML_PARSER_START;
4907
4908 /* Allocate the Node stack */
4909 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4910 if (ctxt->nodeTab == NULL)
4911 return(-1);
4912 ctxt->nodeNr = 0;
4913 ctxt->nodeMax = 10;
4914 ctxt->node = NULL;
4915
4916 /* Allocate the Name stack */
4917 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4918 if (ctxt->nameTab == NULL)
4919 return(-1);
4920 ctxt->nameNr = 0;
4921 ctxt->nameMax = 10;
4922 ctxt->name = NULL;
4923
4924 ctxt->nodeInfoTab = NULL;
4925 ctxt->nodeInfoNr = 0;
4926 ctxt->nodeInfoMax = 0;
4927
4928 ctxt->myDoc = NULL;
4929 ctxt->wellFormed = 1;
4930 ctxt->replaceEntities = 0;
4931 ctxt->linenumbers = xmlLineNumbersDefaultValue;
4932 ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
4933 ctxt->html = 1;
4934 ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
4935 ctxt->vctxt.userData = ctxt;
4936 ctxt->vctxt.error = xmlParserValidityError;
4937 ctxt->vctxt.warning = xmlParserValidityWarning;
4938 ctxt->record_info = 0;
4939 ctxt->validate = 0;
4940 ctxt->checkIndex = 0;
4941 ctxt->catalogs = NULL;
4942 xmlInitNodeInfoSeq(&ctxt->node_seq);
4943 return(0);
4944}
4945
4946/**
4947 * htmlFreeParserCtxt:
4948 * @ctxt: an HTML parser context
4949 *
4950 * Free all the memory used by a parser context. However the parsed
4951 * document in ctxt->myDoc is not freed.
4952 */
4953
4954void
4955htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4956{
4957 xmlFreeParserCtxt(ctxt);
4958}
4959
4960/**
4961 * htmlNewParserCtxt:
4962 *
4963 * Allocate and initialize a new HTML parser context.
4964 *
4965 * This can be used to parse HTML documents into DOM trees with
4966 * functions like xmlCtxtReadFile or xmlCtxtReadMemory.
4967 *
4968 * See htmlCtxtUseOptions for parser options.
4969 *
4970 * See xmlCtxtSetErrorHandler for advanced error handling.
4971 *
4972 * See xmlNewInputURL, xmlNewInputMemory, xmlNewInputIO and similar
4973 * functions for advanced input control.
4974 *
4975 * See htmlNewSAXParserCtxt for custom SAX parsers.
4976 *
4977 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4978 */
4979
4980htmlParserCtxtPtr
4981htmlNewParserCtxt(void)
4982{
4983 return(htmlNewSAXParserCtxt(NULL, NULL));
4984}
4985
4986/**
4987 * htmlNewSAXParserCtxt:
4988 * @sax: SAX handler
4989 * @userData: user data
4990 *
4991 * Allocate and initialize a new HTML SAX parser context. If userData
4992 * is NULL, the parser context will be passed as user data.
4993 *
4994 * Available since 2.11.0. If you want support older versions,
4995 * it's best to invoke htmlNewParserCtxt and set ctxt->sax with
4996 * struct assignment.
4997 *
4998 * Also see htmlNewParserCtxt.
4999 *
5000 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5001 */
5002
5003htmlParserCtxtPtr
5004htmlNewSAXParserCtxt(const htmlSAXHandler *sax, void *userData)
5005{
5006 xmlParserCtxtPtr ctxt;
5007
5008 xmlInitParser();
5009
5010 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5011 if (ctxt == NULL)
5012 return(NULL);
5013 memset(ctxt, 0, sizeof(xmlParserCtxt));
5014 if (htmlInitParserCtxt(ctxt, sax, userData) < 0) {
5015 htmlFreeParserCtxt(ctxt);
5016 return(NULL);
5017 }
5018 return(ctxt);
5019}
5020
5021static htmlParserCtxtPtr
5022htmlCreateMemoryParserCtxtInternal(const char *url,
5023 const char *buffer, size_t size,
5024 const char *encoding) {
5025 xmlParserCtxtPtr ctxt;
5026 xmlParserInputPtr input;
5027
5028 if (buffer == NULL)
5029 return(NULL);
5030
5031 ctxt = htmlNewParserCtxt();
5032 if (ctxt == NULL)
5033 return(NULL);
5034
5035 input = xmlNewInputMemory(ctxt, url, buffer, size, encoding, 0);
5036 if (input == NULL) {
5037 xmlFreeParserCtxt(ctxt);
5038 return(NULL);
5039 }
5040
5041 inputPush(ctxt, input);
5042
5043 return(ctxt);
5044}
5045
5046/**
5047 * htmlCreateMemoryParserCtxt:
5048 * @buffer: a pointer to a char array
5049 * @size: the size of the array
5050 *
5051 * DEPRECATED: Use htmlNewParserCtxt and htmlCtxtReadMemory.
5052 *
5053 * Create a parser context for an HTML in-memory document. The input
5054 * buffer must not contain any terminating null bytes.
5055 *
5056 * Returns the new parser context or NULL
5057 */
5058htmlParserCtxtPtr
5059htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5060 if (size <= 0)
5061 return(NULL);
5062
5063 return(htmlCreateMemoryParserCtxtInternal(NULL, buffer, size, NULL));
5064}
5065
5066/**
5067 * htmlCreateDocParserCtxt:
5068 * @str: a pointer to an array of xmlChar
5069 * @encoding: encoding (optional)
5070 *
5071 * Create a parser context for a null-terminated string.
5072 *
5073 * Returns the new parser context or NULL if a memory allocation failed.
5074 */
5075static htmlParserCtxtPtr
5076htmlCreateDocParserCtxt(const xmlChar *str, const char *url,
5077 const char *encoding) {
5078 xmlParserCtxtPtr ctxt;
5079 xmlParserInputPtr input;
5080
5081 if (str == NULL)
5082 return(NULL);
5083
5084 ctxt = htmlNewParserCtxt();
5085 if (ctxt == NULL)
5086 return(NULL);
5087
5088 input = xmlNewInputString(ctxt, url, (const char *) str, encoding, 0);
5089 if (input == NULL) {
5090 xmlFreeParserCtxt(ctxt);
5091 return(NULL);
5092 }
5093
5094 inputPush(ctxt, input);
5095
5096 return(ctxt);
5097}
5098
5099#ifdef LIBXML_PUSH_ENABLED
5100/************************************************************************
5101 * *
5102 * Progressive parsing interfaces *
5103 * *
5104 ************************************************************************/
5105
5106/**
5107 * htmlParseLookupSequence:
5108 * @ctxt: an HTML parser context
5109 * @first: the first char to lookup
5110 * @next: the next char to lookup or zero
5111 * @third: the next char to lookup or zero
5112 * @ignoreattrval: skip over attribute values
5113 *
5114 * Try to find if a sequence (first, next, third) or just (first next) or
5115 * (first) is available in the input stream.
5116 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5117 * to avoid rescanning sequences of bytes, it DOES change the state of the
5118 * parser, do not use liberally.
5119 * This is basically similar to xmlParseLookupSequence()
5120 *
5121 * Returns the index to the current parsing point if the full sequence
5122 * is available, -1 otherwise.
5123 */
5124static int
5125htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5126 xmlChar next, xmlChar third, int ignoreattrval)
5127{
5128 size_t base, len;
5129 htmlParserInputPtr in;
5130 const xmlChar *buf;
5131 int quote;
5132
5133 in = ctxt->input;
5134 if (in == NULL)
5135 return (-1);
5136
5137 base = ctxt->checkIndex;
5138 quote = ctxt->endCheckState;
5139
5140 buf = in->cur;
5141 len = in->end - in->cur;
5142
5143 /* take into account the sequence length */
5144 if (third)
5145 len -= 2;
5146 else if (next)
5147 len--;
5148 for (; base < len; base++) {
5149 if (base >= INT_MAX / 2) {
5150 ctxt->checkIndex = 0;
5151 ctxt->endCheckState = 0;
5152 return (base - 2);
5153 }
5154 if (ignoreattrval) {
5155 if (quote) {
5156 if (buf[base] == quote)
5157 quote = 0;
5158 continue;
5159 }
5160 if (buf[base] == '"' || buf[base] == '\'') {
5161 quote = buf[base];
5162 continue;
5163 }
5164 }
5165 if (buf[base] == first) {
5166 if (third != 0) {
5167 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5168 continue;
5169 } else if (next != 0) {
5170 if (buf[base + 1] != next)
5171 continue;
5172 }
5173 ctxt->checkIndex = 0;
5174 ctxt->endCheckState = 0;
5175 return (base);
5176 }
5177 }
5178 ctxt->checkIndex = base;
5179 ctxt->endCheckState = quote;
5180 return (-1);
5181}
5182
5183/**
5184 * htmlParseLookupCommentEnd:
5185 * @ctxt: an HTML parser context
5186 *
5187 * Try to find a comment end tag in the input stream
5188 * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
5189 * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
5190 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5191 * to avoid rescanning sequences of bytes, it DOES change the state of the
5192 * parser, do not use liberally.
5193 * This wraps to htmlParseLookupSequence()
5194 *
5195 * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
5196 */
5197static int
5198htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5199{
5200 int mark = 0;
5201 int offset;
5202
5203 while (1) {
5204 mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5205 if (mark < 0)
5206 break;
5207 if ((NXT(mark+2) == '>') ||
5208 ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5209 ctxt->checkIndex = 0;
5210 break;
5211 }
5212 offset = (NXT(mark+2) == '!') ? 3 : 2;
5213 if (mark + offset >= ctxt->input->end - ctxt->input->cur) {
5214 ctxt->checkIndex = mark;
5215 return(-1);
5216 }
5217 ctxt->checkIndex = mark + 1;
5218 }
5219 return mark;
5220}
5221
5222
/**
 * htmlParseTryOrFinish:
 * @ctxt:  an HTML parser context
 * @terminate:  last chunk indicator
 *
 * Try to progress on parsing.
 *
 * This is the state machine of the push parser: as long as the parser
 * is not stopped and input is available, it dispatches on ctxt->instate
 * and parses one construct per iteration. When @terminate is zero, most
 * constructs are only parsed after a lookup function confirmed that
 * their end is already buffered; otherwise the function jumps to "done"
 * and waits for more data. When @terminate is non-zero, those lookups
 * are skipped and whatever is buffered is parsed.
 *
 * Returns zero if no parsing was possible. Note that the local "ret"
 * is initialised to 0 and never reassigned, so with the code below the
 * function always returns 0.
 */
static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
    int ret = 0; /* never modified below */
    htmlParserInputPtr in;
    ptrdiff_t avail = 0; /* bytes between in->cur and in->end */
    xmlChar cur, next;

    /*
     * NOTE(review): only begin_pos and begin_line are filled in by this
     * function before node_info is handed to htmlNodeInfoPush().
     */
    htmlParserNodeInfo node_info;

    while (PARSER_STOPPED(ctxt) == 0) {

        in = ctxt->input;
        if (in == NULL) break;
        avail = in->end - in->cur;
        if ((avail == 0) && (terminate)) {
            htmlAutoCloseOnEnd(ctxt);
            if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
                /*
                 * SAX: end of the document processing.
                 */
                ctxt->instate = XML_PARSER_EOF;
                if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                    ctxt->sax->endDocument(ctxt->userData);
            }
        }
        if (avail < 1)
            goto done;
        /*
         * This is done to make progress and avoid an infinite loop
         * if a parsing attempt was aborted by hitting a NUL byte. After
         * changing htmlCurrentChar, this probably isn't necessary anymore.
         * We should consider removing this check.
         */
        cur = in->cur[0];
        if (cur == 0) {
            SKIP(1);
            continue;
        }

        switch (ctxt->instate) {
            case XML_PARSER_EOF:
                /*
                 * Document parsing is done !
                 */
                goto done;
            case XML_PARSER_START:
                /*
                 * This is wrong but matches long-standing behavior. In most
                 * cases, a document starting with an XML declaration will
                 * specify UTF-8.
                 */
                if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
                    (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0)) {
                    xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
                }

                /*
                 * Very first chars read from the document flow.
                 */
                cur = in->cur[0];
                if (IS_BLANK_CH(cur)) {
                    SKIP_BLANKS;
                    avail = in->end - in->cur;
                }
                if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
                    ctxt->sax->setDocumentLocator(ctxt->userData,
                            (xmlSAXLocator *) &xmlDefaultSAXLocator);
                }
                if ((ctxt->sax) && (ctxt->sax->startDocument) &&
                    (!ctxt->disableSAX))
                    ctxt->sax->startDocument(ctxt->userData);

                cur = in->cur[0];
                next = in->cur[1];
                /* A leading "<!DOCTYPE" (case-insensitive) goes to PROLOG. */
                if ((cur == '<') && (next == '!') &&
                    (UPP(2) == 'D') && (UPP(3) == 'O') &&
                    (UPP(4) == 'C') && (UPP(5) == 'T') &&
                    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                    (UPP(8) == 'E')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
                        goto done;
                    htmlParseDocTypeDecl(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
                } else {
                    ctxt->instate = XML_PARSER_MISC;
                }
                break;
            case XML_PARSER_MISC:
                SKIP_BLANKS;
                avail = in->end - in->cur;
                /*
                 * no chars in buffer
                 */
                if (avail < 1)
                    goto done;
                /*
                 * not enough chars in buffer
                 */
                if (avail < 2) {
                    if (!terminate)
                        goto done;
                    else
                        next = ' ';
                } else {
                    next = in->cur[1];
                }
                cur = in->cur[0];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
                        goto done;
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_MISC;
                } else if ((cur == '<') && (next == '?')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                        goto done;
                    htmlParsePI(ctxt);
                    ctxt->instate = XML_PARSER_MISC;
                } else if ((cur == '<') && (next == '!') &&
                    (UPP(2) == 'D') && (UPP(3) == 'O') &&
                    (UPP(4) == 'C') && (UPP(5) == 'T') &&
                    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                    (UPP(8) == 'E')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
                        goto done;
                    htmlParseDocTypeDecl(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 9)) {
                    /* "<!" seen but fewer than 9 bytes buffered: wait */
                    goto done;
                } else {
                    ctxt->instate = XML_PARSER_CONTENT;
                }
                break;
            case XML_PARSER_PROLOG:
                SKIP_BLANKS;
                avail = in->end - in->cur;
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
                        goto done;
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
                } else if ((cur == '<') && (next == '?')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                        goto done;
                    htmlParsePI(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 4)) {
                    goto done;
                } else {
                    ctxt->instate = XML_PARSER_CONTENT;
                }
                break;
            case XML_PARSER_EPILOG:
                avail = in->end - in->cur;
                if (avail < 1)
                    goto done;
                cur = in->cur[0];
                if (IS_BLANK_CH(cur)) {
                    htmlParseCharData(ctxt);
                    goto done;
                }
                if (avail < 2)
                    goto done;
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
                        goto done;
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_EPILOG;
                } else if ((cur == '<') && (next == '?')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                        goto done;
                    htmlParsePI(ctxt);
                    ctxt->instate = XML_PARSER_EPILOG;
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 4)) {
                    goto done;
                } else {
                    /*
                     * Anything else in the epilog is recorded as an
                     * error and ends the document.
                     */
                    ctxt->errNo = XML_ERR_DOCUMENT_END;
                    ctxt->wellFormed = 0;
                    ctxt->instate = XML_PARSER_EOF;
                    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                        ctxt->sax->endDocument(ctxt->userData);
                    goto done;
                }
                break;
            case XML_PARSER_START_TAG: {
                const xmlChar *name;
                int failed;
                const htmlElemDesc * info;

                /*
                 * no chars in buffer
                 */
                if (avail < 1)
                    goto done;
                /*
                 * not enough chars in buffer
                 */
                if (avail < 2) {
                    if (!terminate)
                        goto done;
                    else
                        next = ' ';
                } else {
                    next = in->cur[1];
                }
                cur = in->cur[0];
                if (cur != '<') {
                    ctxt->instate = XML_PARSER_CONTENT;
                    break;
                }
                if (next == '/') {
                    ctxt->instate = XML_PARSER_END_TAG;
                    ctxt->checkIndex = 0;
                    break;
                }
                if ((!terminate) &&
                    (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
                    goto done;

                /* Capture start position */
                if (ctxt->record_info) {
                     node_info.begin_pos = ctxt->input->consumed +
                                        (CUR_PTR - ctxt->input->base);
                     node_info.begin_line = ctxt->input->line;
                }


                failed = htmlParseStartTag(ctxt);
                name = ctxt->name;
                if ((failed == -1) ||
                    (name == NULL)) {
                    if (CUR == '>')
                        NEXT;
                    break;
                }

                /*
                 * Lookup the info for that element.
                 */
                info = htmlTagLookup(name);
                if (info == NULL) {
                    htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
                                 "Tag %s invalid\n", name, NULL);
                }

                /*
                 * Check for an Empty Element labeled the XML/SGML way
                 */
                if ((CUR == '/') && (NXT(1) == '>')) {
                    SKIP(2);
                    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                        ctxt->sax->endElement(ctxt->userData, name);
                    htmlnamePop(ctxt);
                    ctxt->instate = XML_PARSER_CONTENT;
                    break;
                }

                if (CUR == '>') {
                    NEXT;
                } else {
                    htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
                                 "Couldn't find end of Start Tag %s\n",
                                 name, NULL);

                    /*
                     * end of parsing of this node.
                     */
                    if (xmlStrEqual(name, ctxt->name)) {
                        nodePop(ctxt);
                        htmlnamePop(ctxt);
                    }

                    if (ctxt->record_info)
                        htmlNodeInfoPush(ctxt, &node_info);

                    ctxt->instate = XML_PARSER_CONTENT;
                    break;
                }

                /*
                 * Check for an Empty Element from DTD definition
                 */
                if ((info != NULL) && (info->empty)) {
                    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                        ctxt->sax->endElement(ctxt->userData, name);
                    htmlnamePop(ctxt);
                }

                if (ctxt->record_info)
                    htmlNodeInfoPush(ctxt, &node_info);

                ctxt->instate = XML_PARSER_CONTENT;
                break;
            }
            case XML_PARSER_CONTENT: {
                xmlChar chr[2] = { 0, 0 };

                /*
                 * Handle preparsed entities and charRef
                 */
                /*
                 * A single trailing byte of the last chunk that is
                 * neither '<' nor '&' is passed to SAX directly here.
                 */
                if ((avail == 1) && (terminate)) {
                    cur = in->cur[0];
                    if ((cur != '<') && (cur != '&')) {
                        if (ctxt->sax != NULL) {
                            chr[0] = cur;
                            if (IS_BLANK_CH(cur)) {
                                if (ctxt->keepBlanks) {
                                    if (ctxt->sax->characters != NULL)
                                        ctxt->sax->characters(
                                                ctxt->userData, chr, 1);
                                } else {
                                    if (ctxt->sax->ignorableWhitespace != NULL)
                                        ctxt->sax->ignorableWhitespace(
                                                ctxt->userData, chr, 1);
                                }
                            } else {
                                htmlCheckParagraph(ctxt);
                                if (ctxt->sax->characters != NULL)
                                    ctxt->sax->characters(
                                            ctxt->userData, chr, 1);
                            }
                        }
                        ctxt->checkIndex = 0;
                        in->cur++;
                        break;
                    }
                }
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
                    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
                    /*
                     * Handle SCRIPT/STYLE separately
                     */
                    if (!terminate) {
                        int idx;
                        xmlChar val;

                        idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
                        if (idx < 0)
                            goto done;
                        val = in->cur[idx + 2];
                        if (val == 0) { /* bad cut of input */
                            /*
                             * FIXME: htmlParseScript checks for additional
                             * characters after '</'.
                             */
                            ctxt->checkIndex = idx;
                            goto done;
                        }
                    }
                    htmlParseScript(ctxt);
                    /*
                     * cur and next were read before htmlParseScript()
                     * ran, so this tests the bytes the content started
                     * with.
                     */
                    if ((cur == '<') && (next == '/')) {
                        ctxt->instate = XML_PARSER_END_TAG;
                        ctxt->checkIndex = 0;
                        break;
                    }
                } else if ((cur == '<') && (next == '!')) {
                    if (avail < 4)
                        goto done;
                    /*
                     * Sometimes DOCTYPE arrives in the middle of the document
                     */
                    if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
                        (UPP(4) == 'C') && (UPP(5) == 'T') &&
                        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                        (UPP(8) == 'E')) {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
                            goto done;
                        htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
                                     "Misplaced DOCTYPE declaration\n",
                                     BAD_CAST "DOCTYPE" , NULL);
                        htmlParseDocTypeDecl(ctxt);
                    } else if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
                        if ((!terminate) &&
                            (htmlParseLookupCommentEnd(ctxt) < 0))
                            goto done;
                        htmlParseComment(ctxt);
                        ctxt->instate = XML_PARSER_CONTENT;
                    } else {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                            goto done;
                        htmlSkipBogusComment(ctxt);
                    }
                } else if ((cur == '<') && (next == '?')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                        goto done;
                    htmlParsePI(ctxt);
                    ctxt->instate = XML_PARSER_CONTENT;
                } else if ((cur == '<') && (next == '/')) {
                    ctxt->instate = XML_PARSER_END_TAG;
                    ctxt->checkIndex = 0;
                    break;
                } else if ((cur == '<') && IS_ASCII_LETTER(next)) {
                    if ((!terminate) && (next == 0))
                        goto done;
                    ctxt->instate = XML_PARSER_START_TAG;
                    ctxt->checkIndex = 0;
                    break;
                } else if (cur == '<') {
                    /* '<' not starting any markup: emit it as text */
                    if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
                        (ctxt->sax->characters != NULL))
                        ctxt->sax->characters(ctxt->userData,
                                              BAD_CAST "<", 1);
                    NEXT;
                } else {
                    /*
                     * check that the text sequence is complete
                     * before handing out the data to the parser
                     * to avoid problems with erroneous end of
                     * data detection.
                     */
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
                        goto done;
                    ctxt->checkIndex = 0;
                    while ((PARSER_STOPPED(ctxt) == 0) &&
                           (cur != '<') && (in->cur < in->end)) {
                        if (cur == '&') {
                            htmlParseReference(ctxt);
                        } else {
                            htmlParseCharData(ctxt);
                        }
                        cur = in->cur[0];
                    }
                }

                break;
            }
            case XML_PARSER_END_TAG:
                if (avail < 2)
                    goto done;
                if ((!terminate) &&
                    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                    goto done;
                htmlParseEndTag(ctxt);
                /* No open element left: what follows is the epilog. */
                if (ctxt->nameNr == 0) {
                    ctxt->instate = XML_PARSER_EPILOG;
                } else {
                    ctxt->instate = XML_PARSER_CONTENT;
                }
                ctxt->checkIndex = 0;
                break;
            default:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                             "HPP: internal error\n", NULL, NULL);
                ctxt->instate = XML_PARSER_EOF;
                break;
        }
    }
done:
    /*
     * avail holds the value computed most recently inside the loop
     * (or its initial 0 if the loop body never ran).
     */
    if ((avail == 0) && (terminate)) {
        htmlAutoCloseOnEnd(ctxt);
        if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
            /*
             * SAX: end of the document processing.
             */
            ctxt->instate = XML_PARSER_EOF;
            if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                ctxt->sax->endDocument(ctxt->userData);
        }
    }
    /*
     * Unless HTML_PARSE_NODEFDTD is set, give a document that still has
     * no internal subset the HTML 4.0 Transitional one.
     */
    if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
        ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
         (ctxt->instate == XML_PARSER_EPILOG))) {
        xmlDtdPtr dtd;
        dtd = xmlGetIntSubset(ctxt->myDoc);
        if (dtd == NULL) {
            ctxt->myDoc->intSubset =
                xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
                    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
                    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
            if (ctxt->myDoc->intSubset == NULL)
                htmlErrMemory(ctxt);
        }
    }
    return(ret);
}
5730
5731/**
5732 * htmlParseChunk:
5733 * @ctxt: an HTML parser context
5734 * @chunk: chunk of memory
5735 * @size: size of chunk in bytes
5736 * @terminate: last chunk indicator
5737 *
5738 * Parse a chunk of memory in push parser mode.
5739 *
5740 * Assumes that the parser context was initialized with
5741 * htmlCreatePushParserCtxt.
5742 *
5743 * The last chunk, which will often be empty, must be marked with
5744 * the @terminate flag. With the default SAX callbacks, the resulting
5745 * document will be available in ctxt->myDoc. This pointer will not
5746 * be freed by the library.
5747 *
5748 * If the document isn't well-formed, ctxt->myDoc is set to NULL.
5749 *
5750 * Returns an xmlParserErrors code (0 on success).
5751 */
5752int
5753htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5754 int terminate) {
5755 if ((ctxt == NULL) || (ctxt->input == NULL))
5756 return(XML_ERR_ARGUMENT);
5757 if (PARSER_STOPPED(ctxt) != 0)
5758 return(ctxt->errNo);
5759 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5760 (ctxt->input->buf != NULL)) {
5761 size_t pos = ctxt->input->cur - ctxt->input->base;
5762 int res;
5763
5764 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5765 xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos);
5766 if (res < 0) {
5767 htmlParseErr(ctxt, ctxt->input->buf->error,
5768 "xmlParserInputBufferPush failed", NULL, NULL);
5769 xmlHaltParser(ctxt);
5770 return (ctxt->errNo);
5771 }
5772 }
5773 htmlParseTryOrFinish(ctxt, terminate);
5774 if (terminate) {
5775 if (ctxt->instate != XML_PARSER_EOF) {
5776 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5777 ctxt->sax->endDocument(ctxt->userData);
5778 }
5779 ctxt->instate = XML_PARSER_EOF;
5780 }
5781 return((xmlParserErrors) ctxt->errNo);
5782}
5783
5784/************************************************************************
5785 * *
5786 * User entry points *
5787 * *
5788 ************************************************************************/
5789
5790/**
5791 * htmlCreatePushParserCtxt:
5792 * @sax: a SAX handler (optional)
5793 * @user_data: The user data returned on SAX callbacks (optional)
5794 * @chunk: a pointer to an array of chars (optional)
5795 * @size: number of chars in the array
5796 * @filename: only used for error reporting (optional)
5797 * @enc: encoding (deprecated, pass XML_CHAR_ENCODING_NONE)
5798 *
5799 * Create a parser context for using the HTML parser in push mode.
5800 *
5801 * Returns the new parser context or NULL if a memory allocation
5802 * failed.
5803 */
5804htmlParserCtxtPtr
5805htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5806 const char *chunk, int size, const char *filename,
5807 xmlCharEncoding enc) {
5808 htmlParserCtxtPtr ctxt;
5809 htmlParserInputPtr input;
5810 const char *encoding;
5811
5812 ctxt = htmlNewSAXParserCtxt(sax, user_data);
5813 if (ctxt == NULL)
5814 return(NULL);
5815
5816 encoding = xmlGetCharEncodingName(enc);
5817 input = xmlNewInputPush(ctxt, filename, chunk, size, encoding);
5818 if (input == NULL) {
5819 htmlFreeParserCtxt(ctxt);
5820 return(NULL);
5821 }
5822 inputPush(ctxt, input);
5823
5824 return(ctxt);
5825}
5826#endif /* LIBXML_PUSH_ENABLED */
5827
5828/**
5829 * htmlSAXParseDoc:
5830 * @cur: a pointer to an array of xmlChar
5831 * @encoding: a free form C string describing the HTML document encoding, or NULL
5832 * @sax: the SAX handler block
5833 * @userData: if using SAX, this pointer will be provided on callbacks.
5834 *
5835 * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadDoc.
5836 *
5837 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5838 * to handle parse events. If sax is NULL, fallback to the default DOM
5839 * behavior and return a tree.
5840 *
5841 * Returns the resulting document tree unless SAX is NULL or the document is
5842 * not well formed.
5843 */
5844
5845htmlDocPtr
5846htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
5847 htmlSAXHandlerPtr sax, void *userData) {
5848 htmlDocPtr ret;
5849 htmlParserCtxtPtr ctxt;
5850
5851 if (cur == NULL)
5852 return(NULL);
5853
5854 ctxt = htmlCreateDocParserCtxt(cur, NULL, encoding);
5855 if (ctxt == NULL)
5856 return(NULL);
5857
5858 if (sax != NULL) {
5859 *ctxt->sax = *sax;
5860 ctxt->userData = userData;
5861 }
5862
5863 htmlParseDocument(ctxt);
5864 ret = ctxt->myDoc;
5865 htmlFreeParserCtxt(ctxt);
5866
5867 return(ret);
5868}
5869
5870/**
5871 * htmlParseDoc:
5872 * @cur: a pointer to an array of xmlChar
5873 * @encoding: the encoding (optional)
5874 *
5875 * DEPRECATED: Use htmlReadDoc.
5876 *
5877 * Parse an HTML in-memory document and build a tree.
5878 *
5879 * This function uses deprecated global parser options.
5880 *
5881 * Returns the resulting document tree
5882 */
5883
5884htmlDocPtr
5885htmlParseDoc(const xmlChar *cur, const char *encoding) {
5886 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5887}
5888
5889
5890/**
5891 * htmlCreateFileParserCtxt:
5892 * @filename: the filename
5893 * @encoding: optional encoding
5894 *
5895 * DEPRECATED: Use htmlNewParserCtxt and htmlCtxtReadFile.
5896 *
5897 * Create a parser context to read from a file.
5898 *
5899 * A non-NULL encoding overrides encoding declarations in the document.
5900 *
5901 * Automatic support for ZLIB/Compress compressed document is provided
5902 * by default if found at compile-time.
5903 *
5904 * Returns the new parser context or NULL if a memory allocation failed.
5905 */
5906htmlParserCtxtPtr
5907htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5908{
5909 htmlParserCtxtPtr ctxt;
5910 htmlParserInputPtr input;
5911
5912 if (filename == NULL)
5913 return(NULL);
5914
5915 ctxt = htmlNewParserCtxt();
5916 if (ctxt == NULL) {
5917 return(NULL);
5918 }
5919
5920 input = xmlNewInputURL(ctxt, filename, NULL, encoding, 0);
5921 if (input == NULL) {
5922 xmlFreeParserCtxt(ctxt);
5923 return(NULL);
5924 }
5925 inputPush(ctxt, input);
5926
5927 return(ctxt);
5928}
5929
5930/**
5931 * htmlSAXParseFile:
5932 * @filename: the filename
5933 * @encoding: encoding (optional)
5934 * @sax: the SAX handler block
5935 * @userData: if using SAX, this pointer will be provided on callbacks.
5936 *
5937 * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadFile.
5938 *
5939 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5940 * compressed document is provided by default if found at compile-time.
5941 * It use the given SAX function block to handle the parsing callback.
5942 * If sax is NULL, fallback to the default DOM tree building routines.
5943 *
5944 * Returns the resulting document tree unless SAX is NULL or the document is
5945 * not well formed.
5946 */
5947
5948htmlDocPtr
5949htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5950 void *userData) {
5951 htmlDocPtr ret;
5952 htmlParserCtxtPtr ctxt;
5953 htmlSAXHandlerPtr oldsax = NULL;
5954
5955 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5956 if (ctxt == NULL) return(NULL);
5957 if (sax != NULL) {
5958 oldsax = ctxt->sax;
5959 ctxt->sax = sax;
5960 ctxt->userData = userData;
5961 }
5962
5963 htmlParseDocument(ctxt);
5964
5965 ret = ctxt->myDoc;
5966 if (sax != NULL) {
5967 ctxt->sax = oldsax;
5968 ctxt->userData = NULL;
5969 }
5970 htmlFreeParserCtxt(ctxt);
5971
5972 return(ret);
5973}
5974
5975/**
5976 * htmlParseFile:
5977 * @filename: the filename
5978 * @encoding: encoding (optional)
5979 *
5980 * Parse an HTML file and build a tree.
5981 *
5982 * See xmlNewInputURL for details.
5983 *
5984 * Returns the resulting document tree
5985 */
5986
5987htmlDocPtr
5988htmlParseFile(const char *filename, const char *encoding) {
5989 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5990}
5991
5992/**
5993 * htmlHandleOmittedElem:
5994 * @val: int 0 or 1
5995 *
5996 * DEPRECATED: Use HTML_PARSE_NOIMPLIED
5997 *
5998 * Set and return the previous value for handling HTML omitted tags.
5999 *
6000 * Returns the last value for 0 for no handling, 1 for auto insertion.
6001 */
6002
6003int
6004htmlHandleOmittedElem(int val) {
6005 int old = htmlOmittedDefaultValue;
6006
6007 htmlOmittedDefaultValue = val;
6008 return(old);
6009}
6010
6011/**
6012 * htmlElementAllowedHere:
6013 * @parent: HTML parent element
6014 * @elt: HTML element
6015 *
6016 * Checks whether an HTML element may be a direct child of a parent element.
6017 * Note - doesn't check for deprecated elements
6018 *
6019 * Returns 1 if allowed; 0 otherwise.
6020 */
6021int
6022htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6023 const char** p ;
6024
6025 if ( ! elt || ! parent || ! parent->subelts )
6026 return 0 ;
6027
6028 for ( p = parent->subelts; *p; ++p )
6029 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6030 return 1 ;
6031
6032 return 0 ;
6033}
6034/**
6035 * htmlElementStatusHere:
6036 * @parent: HTML parent element
6037 * @elt: HTML element
6038 *
6039 * Checks whether an HTML element may be a direct child of a parent element.
6040 * and if so whether it is valid or deprecated.
6041 *
6042 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6043 */
6044htmlStatus
6045htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6046 if ( ! parent || ! elt )
6047 return HTML_INVALID ;
6048 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6049 return HTML_INVALID ;
6050
6051 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6052}
6053/**
6054 * htmlAttrAllowed:
6055 * @elt: HTML element
6056 * @attr: HTML attribute
6057 * @legacy: whether to allow deprecated attributes
6058 *
6059 * Checks whether an attribute is valid for an element
6060 * Has full knowledge of Required and Deprecated attributes
6061 *
6062 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6063 */
6064htmlStatus
6065htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6066 const char** p ;
6067
6068 if ( !elt || ! attr )
6069 return HTML_INVALID ;
6070
6071 if ( elt->attrs_req )
6072 for ( p = elt->attrs_req; *p; ++p)
6073 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6074 return HTML_REQUIRED ;
6075
6076 if ( elt->attrs_opt )
6077 for ( p = elt->attrs_opt; *p; ++p)
6078 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6079 return HTML_VALID ;
6080
6081 if ( legacy && elt->attrs_depr )
6082 for ( p = elt->attrs_depr; *p; ++p)
6083 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6084 return HTML_DEPRECATED ;
6085
6086 return HTML_INVALID ;
6087}
6088/**
6089 * htmlNodeStatus:
6090 * @node: an htmlNodePtr in a tree
6091 * @legacy: whether to allow deprecated elements (YES is faster here
6092 * for Element nodes)
6093 *
6094 * Checks whether the tree node is valid. Experimental (the author
6095 * only uses the HTML enhancements in a SAX parser)
6096 *
6097 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6098 * legacy allowed) or htmlElementStatusHere (otherwise).
6099 * for Attribute nodes, a return from htmlAttrAllowed
6100 * for other nodes, HTML_NA (no checks performed)
6101 */
6102htmlStatus
6103htmlNodeStatus(htmlNodePtr node, int legacy) {
6104 if ( ! node )
6105 return HTML_INVALID ;
6106
6107 switch ( node->type ) {
6108 case XML_ELEMENT_NODE:
6109 return legacy
6110 ? ( htmlElementAllowedHere (
6111 htmlTagLookup(node->parent->name) , node->name
6112 ) ? HTML_VALID : HTML_INVALID )
6113 : htmlElementStatusHere(
6114 htmlTagLookup(node->parent->name) ,
6115 htmlTagLookup(node->name) )
6116 ;
6117 case XML_ATTRIBUTE_NODE:
6118 return htmlAttrAllowed(
6119 htmlTagLookup(node->parent->name) , node->name, legacy) ;
6120 default: return HTML_NA ;
6121 }
6122}
6123/************************************************************************
6124 * *
6125 * New set (2.6.0) of simpler and more flexible APIs *
6126 * *
6127 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. Nothing is done if @str is NULL; if "dict" is NULL the
 * string is always freed.
 *
 * The expansion refers to a variable named "dict" that must be visible
 * where the macro is used. It is a bare "if" statement that already
 * ends in a semicolon and evaluates @str more than once, so it should
 * only be used as a statement of its own (not as the body of an
 * unbraced if/else) and with an argument free of side effects.
 */
#define DICT_FREE(str)						\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));
6139
6140/**
6141 * htmlCtxtReset:
6142 * @ctxt: an HTML parser context
6143 *
6144 * Reset a parser context
6145 */
6146void
6147htmlCtxtReset(htmlParserCtxtPtr ctxt)
6148{
6149 xmlParserInputPtr input;
6150 xmlDictPtr dict;
6151
6152 if (ctxt == NULL)
6153 return;
6154
6155 dict = ctxt->dict;
6156
6157 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6158 xmlFreeInputStream(input);
6159 }
6160 ctxt->inputNr = 0;
6161 ctxt->input = NULL;
6162
6163 ctxt->spaceNr = 0;
6164 if (ctxt->spaceTab != NULL) {
6165 ctxt->spaceTab[0] = -1;
6166 ctxt->space = &ctxt->spaceTab[0];
6167 } else {
6168 ctxt->space = NULL;
6169 }
6170
6171
6172 ctxt->nodeNr = 0;
6173 ctxt->node = NULL;
6174
6175 ctxt->nameNr = 0;
6176 ctxt->name = NULL;
6177
6178 ctxt->nsNr = 0;
6179
6180 DICT_FREE(ctxt->version);
6181 ctxt->version = NULL;
6182 DICT_FREE(ctxt->encoding);
6183 ctxt->encoding = NULL;
6184 DICT_FREE(ctxt->extSubURI);
6185 ctxt->extSubURI = NULL;
6186 DICT_FREE(ctxt->extSubSystem);
6187 ctxt->extSubSystem = NULL;
6188
6189 if (ctxt->directory != NULL) {
6190 xmlFree(ctxt->directory);
6191 ctxt->directory = NULL;
6192 }
6193
6194 if (ctxt->myDoc != NULL)
6195 xmlFreeDoc(ctxt->myDoc);
6196 ctxt->myDoc = NULL;
6197
6198 ctxt->standalone = -1;
6199 ctxt->hasExternalSubset = 0;
6200 ctxt->hasPErefs = 0;
6201 ctxt->html = 1;
6202 ctxt->instate = XML_PARSER_START;
6203
6204 ctxt->wellFormed = 1;
6205 ctxt->nsWellFormed = 1;
6206 ctxt->disableSAX = 0;
6207 ctxt->valid = 1;
6208 ctxt->vctxt.userData = ctxt;
6209 ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
6210 ctxt->vctxt.error = xmlParserValidityError;
6211 ctxt->vctxt.warning = xmlParserValidityWarning;
6212 ctxt->record_info = 0;
6213 ctxt->checkIndex = 0;
6214 ctxt->endCheckState = 0;
6215 ctxt->inSubset = 0;
6216 ctxt->errNo = XML_ERR_OK;
6217 ctxt->depth = 0;
6218 ctxt->catalogs = NULL;
6219 xmlInitNodeInfoSeq(&ctxt->node_seq);
6220
6221 if (ctxt->attsDefault != NULL) {
6222 xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6223 ctxt->attsDefault = NULL;
6224 }
6225 if (ctxt->attsSpecial != NULL) {
6226 xmlHashFree(ctxt->attsSpecial, NULL);
6227 ctxt->attsSpecial = NULL;
6228 }
6229
6230 ctxt->nbErrors = 0;
6231 ctxt->nbWarnings = 0;
6232 if (ctxt->lastError.code != XML_ERR_OK)
6233 xmlResetError(&ctxt->lastError);
6234}
6235
6236/**
6237 * htmlCtxtUseOptions:
6238 * @ctxt: an HTML parser context
6239 * @options: a combination of htmlParserOption(s)
6240 *
6241 * Applies the options to the parser context
6242 *
6243 * Returns 0 in case of success, the set of unknown or unimplemented options
6244 * in case of error.
6245 */
6246int
6247htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6248{
6249 if (ctxt == NULL)
6250 return(-1);
6251
6252 if (options & HTML_PARSE_NOWARNING) {
6253 ctxt->sax->warning = NULL;
6254 ctxt->vctxt.warning = NULL;
6255 options -= XML_PARSE_NOWARNING;
6256 ctxt->options |= XML_PARSE_NOWARNING;
6257 }
6258 if (options & HTML_PARSE_NOERROR) {
6259 ctxt->sax->error = NULL;
6260 ctxt->vctxt.error = NULL;
6261 ctxt->sax->fatalError = NULL;
6262 options -= XML_PARSE_NOERROR;
6263 ctxt->options |= XML_PARSE_NOERROR;
6264 }
6265 if (options & HTML_PARSE_PEDANTIC) {
6266 ctxt->pedantic = 1;
6267 options -= XML_PARSE_PEDANTIC;
6268 ctxt->options |= XML_PARSE_PEDANTIC;
6269 } else
6270 ctxt->pedantic = 0;
6271 if (options & XML_PARSE_NOBLANKS) {
6272 ctxt->keepBlanks = 0;
6273 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6274 options -= XML_PARSE_NOBLANKS;
6275 ctxt->options |= XML_PARSE_NOBLANKS;
6276 } else
6277 ctxt->keepBlanks = 1;
6278 if (options & HTML_PARSE_RECOVER) {
6279 ctxt->recovery = 1;
6280 options -= HTML_PARSE_RECOVER;
6281 } else
6282 ctxt->recovery = 0;
6283 if (options & HTML_PARSE_COMPACT) {
6284 ctxt->options |= HTML_PARSE_COMPACT;
6285 options -= HTML_PARSE_COMPACT;
6286 }
6287 if (options & XML_PARSE_HUGE) {
6288 ctxt->options |= XML_PARSE_HUGE;
6289 options -= XML_PARSE_HUGE;
6290 }
6291 if (options & HTML_PARSE_NODEFDTD) {
6292 ctxt->options |= HTML_PARSE_NODEFDTD;
6293 options -= HTML_PARSE_NODEFDTD;
6294 }
6295 if (options & HTML_PARSE_IGNORE_ENC) {
6296 ctxt->options |= HTML_PARSE_IGNORE_ENC;
6297 options -= HTML_PARSE_IGNORE_ENC;
6298 }
6299 if (options & HTML_PARSE_NOIMPLIED) {
6300 ctxt->options |= HTML_PARSE_NOIMPLIED;
6301 options -= HTML_PARSE_NOIMPLIED;
6302 }
6303 ctxt->dictNames = 0;
6304 ctxt->linenumbers = 1;
6305 return (options);
6306}
6307
6308/**
6309 * htmlCtxtParseDocument:
6310 * @ctxt: an HTML parser context
6311 * @input: parser input
6312 *
6313 * Parse an HTML document and return the resulting document tree.
6314 *
6315 * Available since 2.13.0.
6316 *
6317 * Returns the resulting document tree or NULL
6318 */
6319htmlDocPtr
6320htmlCtxtParseDocument(htmlParserCtxtPtr ctxt, xmlParserInputPtr input)
6321{
6322 htmlDocPtr ret;
6323
6324 if ((ctxt == NULL) || (input == NULL))
6325 return(NULL);
6326
6327 /* assert(ctxt->inputNr == 0); */
6328 while (ctxt->inputNr > 0)
6329 xmlFreeInputStream(inputPop(ctxt));
6330
6331 if (inputPush(ctxt, input) < 0) {
6332 xmlFreeInputStream(input);
6333 return(NULL);
6334 }
6335
6336 ctxt->html = 1;
6337 htmlParseDocument(ctxt);
6338
6339 if (ctxt->errNo != XML_ERR_NO_MEMORY) {
6340 ret = ctxt->myDoc;
6341 } else {
6342 ret = NULL;
6343 xmlFreeDoc(ctxt->myDoc);
6344 }
6345 ctxt->myDoc = NULL;
6346
6347 /* assert(ctxt->inputNr == 1); */
6348 while (ctxt->inputNr > 0)
6349 xmlFreeInputStream(inputPop(ctxt));
6350
6351 return(ret);
6352}
6353
6354/**
6355 * htmlReadDoc:
6356 * @str: a pointer to a zero terminated string
6357 * @url: only used for error reporting (optoinal)
6358 * @encoding: the document encoding (optional)
6359 * @options: a combination of htmlParserOptions
6360 *
6361 * Convenience function to parse an HTML document from a zero-terminated
6362 * string.
6363 *
6364 * See htmlCtxtReadDoc for details.
6365 *
6366 * Returns the resulting document tree.
6367 */
6368htmlDocPtr
6369htmlReadDoc(const xmlChar *str, const char *url, const char *encoding,
6370 int options)
6371{
6372 htmlParserCtxtPtr ctxt;
6373 xmlParserInputPtr input;
6374 htmlDocPtr doc;
6375
6376 ctxt = htmlNewParserCtxt();
6377 if (ctxt == NULL)
6378 return(NULL);
6379
6380 htmlCtxtUseOptions(ctxt, options);
6381
6382 input = xmlNewInputString(ctxt, url, (const char *) str, encoding,
6383 XML_INPUT_BUF_STATIC);
6384
6385 doc = htmlCtxtParseDocument(ctxt, input);
6386
6387 htmlFreeParserCtxt(ctxt);
6388 return(doc);
6389}
6390
6391/**
6392 * htmlReadFile:
6393 * @filename: a file or URL
6394 * @encoding: the document encoding (optional)
6395 * @options: a combination of htmlParserOptions
6396 *
6397 * Convenience function to parse an HTML file from the filesystem,
6398 * the network or a global user-defined resource loader.
6399 *
6400 * See htmlCtxtReadFile for details.
6401 *
6402 * Returns the resulting document tree.
6403 */
6404htmlDocPtr
6405htmlReadFile(const char *filename, const char *encoding, int options)
6406{
6407 htmlParserCtxtPtr ctxt;
6408 xmlParserInputPtr input;
6409 htmlDocPtr doc;
6410
6411 ctxt = htmlNewParserCtxt();
6412 if (ctxt == NULL)
6413 return(NULL);
6414
6415 htmlCtxtUseOptions(ctxt, options);
6416
6417 input = xmlNewInputURL(ctxt, filename, NULL, encoding, 0);
6418
6419 doc = htmlCtxtParseDocument(ctxt, input);
6420
6421 htmlFreeParserCtxt(ctxt);
6422 return(doc);
6423}
6424
6425/**
6426 * htmlReadMemory:
6427 * @buffer: a pointer to a char array
6428 * @size: the size of the array
6429 * @url: only used for error reporting (optional)
6430 * @encoding: the document encoding, or NULL
6431 * @options: a combination of htmlParserOption(s)
6432 *
6433 * Convenience function to parse an HTML document from memory.
6434 * The input buffer must not contain any terminating null bytes.
6435 *
6436 * See htmlCtxtReadMemory for details.
6437 *
6438 * Returns the resulting document tree
6439 */
6440htmlDocPtr
6441htmlReadMemory(const char *buffer, int size, const char *url,
6442 const char *encoding, int options)
6443{
6444 htmlParserCtxtPtr ctxt;
6445 xmlParserInputPtr input;
6446 htmlDocPtr doc;
6447
6448 if (size < 0)
6449 return(NULL);
6450
6451 ctxt = htmlNewParserCtxt();
6452 if (ctxt == NULL)
6453 return(NULL);
6454
6455 htmlCtxtUseOptions(ctxt, options);
6456
6457 input = xmlNewInputMemory(ctxt, url, buffer, size, encoding,
6458 XML_INPUT_BUF_STATIC);
6459
6460 doc = htmlCtxtParseDocument(ctxt, input);
6461
6462 htmlFreeParserCtxt(ctxt);
6463 return(doc);
6464}
6465
6466/**
6467 * htmlReadFd:
6468 * @fd: an open file descriptor
6469 * @url: only used for error reporting (optional)
6470 * @encoding: the document encoding, or NULL
6471 * @options: a combination of htmlParserOptions
6472 *
6473 * Convenience function to parse an HTML document from a
6474 * file descriptor.
6475 *
6476 * NOTE that the file descriptor will not be closed when the
6477 * context is freed or reset.
6478 *
6479 * See htmlCtxtReadFd for details.
6480 *
6481 * Returns the resulting document tree
6482 */
6483htmlDocPtr
6484htmlReadFd(int fd, const char *url, const char *encoding, int options)
6485{
6486 htmlParserCtxtPtr ctxt;
6487 xmlParserInputPtr input;
6488 htmlDocPtr doc;
6489
6490 ctxt = htmlNewParserCtxt();
6491 if (ctxt == NULL)
6492 return(NULL);
6493
6494 htmlCtxtUseOptions(ctxt, options);
6495
6496 input = xmlNewInputFd(ctxt, url, fd, encoding, 0);
6497
6498 doc = htmlCtxtParseDocument(ctxt, input);
6499
6500 htmlFreeParserCtxt(ctxt);
6501 return(doc);
6502}
6503
6504/**
6505 * htmlReadIO:
6506 * @ioread: an I/O read function
6507 * @ioclose: an I/O close function (optional)
6508 * @ioctx: an I/O handler
6509 * @url: only used for error reporting (optional)
6510 * @encoding: the document encoding (optional)
6511 * @options: a combination of htmlParserOption(s)
6512 *
6513 * Convenience function to parse an HTML document from I/O functions
6514 * and context.
6515 *
6516 * See htmlCtxtReadIO for details.
6517 *
6518 * Returns the resulting document tree
6519 */
6520htmlDocPtr
6521htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6522 void *ioctx, const char *url, const char *encoding, int options)
6523{
6524 htmlParserCtxtPtr ctxt;
6525 xmlParserInputPtr input;
6526 htmlDocPtr doc;
6527
6528 ctxt = htmlNewParserCtxt();
6529 if (ctxt == NULL)
6530 return (NULL);
6531
6532 htmlCtxtUseOptions(ctxt, options);
6533
6534 input = xmlNewInputIO(ctxt, url, ioread, ioclose, ioctx, encoding, 0);
6535
6536 doc = htmlCtxtParseDocument(ctxt, input);
6537
6538 htmlFreeParserCtxt(ctxt);
6539 return(doc);
6540}
6541
6542/**
6543 * htmlCtxtReadDoc:
6544 * @ctxt: an HTML parser context
6545 * @str: a pointer to a zero terminated string
6546 * @URL: only used for error reporting (optional)
6547 * @encoding: the document encoding (optional)
6548 * @options: a combination of htmlParserOptions
6549 *
6550 * Parse an HTML in-memory document and build a tree.
6551 *
6552 * See htmlCtxtUseOptions for details.
6553 *
6554 * Returns the resulting document tree
6555 */
6556htmlDocPtr
6557htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar *str,
6558 const char *URL, const char *encoding, int options)
6559{
6560 xmlParserInputPtr input;
6561
6562 if (ctxt == NULL)
6563 return (NULL);
6564
6565 htmlCtxtReset(ctxt);
6566 htmlCtxtUseOptions(ctxt, options);
6567
6568 input = xmlNewInputString(ctxt, URL, (const char *) str, encoding, 0);
6569
6570 return(htmlCtxtParseDocument(ctxt, input));
6571}
6572
6573/**
6574 * htmlCtxtReadFile:
6575 * @ctxt: an HTML parser context
6576 * @filename: a file or URL
6577 * @encoding: the document encoding (optional)
6578 * @options: a combination of htmlParserOptions
6579 *
6580 * Parse an HTML file from the filesystem, the network or a
6581 * user-defined resource loader.
6582 *
6583 * See xmlNewInputURL and htmlCtxtUseOptions for details.
6584 *
6585 * Returns the resulting document tree
6586 */
6587htmlDocPtr
6588htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6589 const char *encoding, int options)
6590{
6591 xmlParserInputPtr input;
6592
6593 if (ctxt == NULL)
6594 return (NULL);
6595
6596 htmlCtxtReset(ctxt);
6597 htmlCtxtUseOptions(ctxt, options);
6598
6599 input = xmlNewInputURL(ctxt, filename, NULL, encoding, 0);
6600
6601 return(htmlCtxtParseDocument(ctxt, input));
6602}
6603
6604/**
6605 * htmlCtxtReadMemory:
6606 * @ctxt: an HTML parser context
6607 * @buffer: a pointer to a char array
6608 * @size: the size of the array
6609 * @URL: only used for error reporting (optional)
6610 * @encoding: the document encoding (optinal)
6611 * @options: a combination of htmlParserOptions
6612 *
6613 * Parse an HTML in-memory document and build a tree. The input buffer must
6614 * not contain any terminating null bytes.
6615 *
6616 * See htmlCtxtUseOptions for details.
6617 *
6618 * Returns the resulting document tree
6619 */
6620htmlDocPtr
6621htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6622 const char *URL, const char *encoding, int options)
6623{
6624 xmlParserInputPtr input;
6625
6626 if ((ctxt == NULL) || (size < 0))
6627 return (NULL);
6628
6629 htmlCtxtReset(ctxt);
6630 htmlCtxtUseOptions(ctxt, options);
6631
6632 input = xmlNewInputMemory(ctxt, URL, buffer, size, encoding,
6633 XML_INPUT_BUF_STATIC);
6634
6635 return(htmlCtxtParseDocument(ctxt, input));
6636}
6637
6638/**
6639 * htmlCtxtReadFd:
6640 * @ctxt: an HTML parser context
6641 * @fd: an open file descriptor
6642 * @URL: only used for error reporting (optional)
6643 * @encoding: the document encoding (optinal)
6644 * @options: a combination of htmlParserOptions
6645 *
6646 * Parse an HTML from a file descriptor and build a tree.
6647 *
6648 * See htmlCtxtUseOptions for details.
6649 *
6650 * NOTE that the file descriptor will not be closed when the
6651 * context is freed or reset.
6652 *
6653 * Returns the resulting document tree
6654 */
6655htmlDocPtr
6656htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6657 const char *URL, const char *encoding, int options)
6658{
6659 xmlParserInputPtr input;
6660
6661 if (ctxt == NULL)
6662 return(NULL);
6663
6664 htmlCtxtReset(ctxt);
6665 htmlCtxtUseOptions(ctxt, options);
6666
6667 input = xmlNewInputFd(ctxt, URL, fd, encoding, 0);
6668
6669 return(htmlCtxtParseDocument(ctxt, input));
6670}
6671
6672/**
6673 * htmlCtxtReadIO:
6674 * @ctxt: an HTML parser context
6675 * @ioread: an I/O read function
6676 * @ioclose: an I/O close function
6677 * @ioctx: an I/O handler
6678 * @URL: the base URL to use for the document
6679 * @encoding: the document encoding, or NULL
6680 * @options: a combination of htmlParserOption(s)
6681 *
6682 * Parse an HTML document from I/O functions and source and build a tree.
6683 *
6684 * See xmlNewInputIO and htmlCtxtUseOptions for details.
6685 *
6686 * Returns the resulting document tree
6687 */
6688htmlDocPtr
6689htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6690 xmlInputCloseCallback ioclose, void *ioctx,
6691 const char *URL,
6692 const char *encoding, int options)
6693{
6694 xmlParserInputPtr input;
6695
6696 if (ctxt == NULL)
6697 return (NULL);
6698
6699 htmlCtxtReset(ctxt);
6700 htmlCtxtUseOptions(ctxt, options);
6701
6702 input = xmlNewInputIO(ctxt, URL, ioread, ioclose, ioctx, encoding, 0);
6703
6704 return(htmlCtxtParseDocument(ctxt, input));
6705}
6706
6707#endif /* LIBXML_HTML_ENABLED */
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette