VirtualBox

source: vbox/trunk/src/libs/libxml2-2.9.4/HTMLparser.c@ 76748

Last change on this file since 76748 was 65950, checked in by vboxsync, 8 years ago

libxml 2.9.4: fix export

  • Property svn:eol-style set to native
File size: 204.8 KB
Line 
1/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * daniel@veillard.com
7 */
8
9#define IN_LIBXML
10#include "libxml.h"
11#ifdef LIBXML_HTML_ENABLED
12
13#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
39#include <libxml/HTMLtree.h>
40#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
44#include <libxml/globals.h>
45#include <libxml/uri.h>
46
47#include "buf.h"
48#include "enc.h"
49
/*
 * Compile-time size constants for the HTML parser.  The code that uses
 * them lies outside this part of the file.
 */
#define HTML_MAX_NAMELEN		1000
#define HTML_PARSER_BIG_BUFFER_SIZE	1000
#define HTML_PARSER_BUFFER_SIZE		100
53
54/* #define DEBUG */
55/* #define DEBUG_PUSH */
56
57static int htmlOmittedDefaultValue = 1;
58
59xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
60 xmlChar end, xmlChar end2, xmlChar end3);
61static void htmlParseComment(htmlParserCtxtPtr ctxt);
62
63/************************************************************************
64 * *
65 * Some factorized error routines *
66 * *
67 ************************************************************************/
68
69/**
70 * htmlErrMemory:
71 * @ctxt: an HTML parser context
72 * @extra: extra informations
73 *
74 * Handle a redefinition of attribute error
75 */
76static void
77htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
78{
79 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
80 (ctxt->instate == XML_PARSER_EOF))
81 return;
82 if (ctxt != NULL) {
83 ctxt->errNo = XML_ERR_NO_MEMORY;
84 ctxt->instate = XML_PARSER_EOF;
85 ctxt->disableSAX = 1;
86 }
87 if (extra)
88 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
89 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
90 NULL, NULL, 0, 0,
91 "Memory allocation failed : %s\n", extra);
92 else
93 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
94 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
95 NULL, NULL, 0, 0, "Memory allocation failed\n");
96}
97
98/**
99 * htmlParseErr:
100 * @ctxt: an HTML parser context
101 * @error: the error number
102 * @msg: the error message
103 * @str1: string infor
104 * @str2: string infor
105 *
106 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
107 */
108static void LIBXML_ATTR_FORMAT(3,0)
109htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
110 const char *msg, const xmlChar *str1, const xmlChar *str2)
111{
112 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
113 (ctxt->instate == XML_PARSER_EOF))
114 return;
115 if (ctxt != NULL)
116 ctxt->errNo = error;
117 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
118 XML_ERR_ERROR, NULL, 0,
119 (const char *) str1, (const char *) str2,
120 NULL, 0, 0,
121 msg, str1, str2);
122 if (ctxt != NULL)
123 ctxt->wellFormed = 0;
124}
125
126/**
127 * htmlParseErrInt:
128 * @ctxt: an HTML parser context
129 * @error: the error number
130 * @msg: the error message
131 * @val: integer info
132 *
133 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
134 */
135static void LIBXML_ATTR_FORMAT(3,0)
136htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
137 const char *msg, int val)
138{
139 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
140 (ctxt->instate == XML_PARSER_EOF))
141 return;
142 if (ctxt != NULL)
143 ctxt->errNo = error;
144 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
145 XML_ERR_ERROR, NULL, 0, NULL, NULL,
146 NULL, val, 0, msg, val);
147 if (ctxt != NULL)
148 ctxt->wellFormed = 0;
149}
150
151/************************************************************************
152 * *
153 * Parser stacks related functions and macros *
154 * *
155 ************************************************************************/
156
157/**
158 * htmlnamePush:
159 * @ctxt: an HTML parser context
160 * @value: the element name
161 *
162 * Pushes a new element name on top of the name stack
163 *
164 * Returns 0 in case of error, the index in the stack otherwise
165 */
166static int
167htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
168{
169 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
170 ctxt->html = 3;
171 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
172 ctxt->html = 10;
173 if (ctxt->nameNr >= ctxt->nameMax) {
174 ctxt->nameMax *= 2;
175 ctxt->nameTab = (const xmlChar * *)
176 xmlRealloc((xmlChar * *)ctxt->nameTab,
177 ctxt->nameMax *
178 sizeof(ctxt->nameTab[0]));
179 if (ctxt->nameTab == NULL) {
180 htmlErrMemory(ctxt, NULL);
181 return (0);
182 }
183 }
184 ctxt->nameTab[ctxt->nameNr] = value;
185 ctxt->name = value;
186 return (ctxt->nameNr++);
187}
188/**
189 * htmlnamePop:
190 * @ctxt: an HTML parser context
191 *
192 * Pops the top element name from the name stack
193 *
194 * Returns the name just removed
195 */
196static const xmlChar *
197htmlnamePop(htmlParserCtxtPtr ctxt)
198{
199 const xmlChar *ret;
200
201 if (ctxt->nameNr <= 0)
202 return (NULL);
203 ctxt->nameNr--;
204 if (ctxt->nameNr < 0)
205 return (NULL);
206 if (ctxt->nameNr > 0)
207 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
208 else
209 ctxt->name = NULL;
210 ret = ctxt->nameTab[ctxt->nameNr];
211 ctxt->nameTab[ctxt->nameNr] = NULL;
212 return (ret);
213}
214
215/**
216 * htmlNodeInfoPush:
217 * @ctxt: an HTML parser context
218 * @value: the node info
219 *
220 * Pushes a new element name on top of the node info stack
221 *
222 * Returns 0 in case of error, the index in the stack otherwise
223 */
224static int
225htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
226{
227 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
228 if (ctxt->nodeInfoMax == 0)
229 ctxt->nodeInfoMax = 5;
230 ctxt->nodeInfoMax *= 2;
231 ctxt->nodeInfoTab = (htmlParserNodeInfo *)
232 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
233 ctxt->nodeInfoMax *
234 sizeof(ctxt->nodeInfoTab[0]));
235 if (ctxt->nodeInfoTab == NULL) {
236 htmlErrMemory(ctxt, NULL);
237 return (0);
238 }
239 }
240 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
241 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
242 return (ctxt->nodeInfoNr++);
243}
244
245/**
246 * htmlNodeInfoPop:
247 * @ctxt: an HTML parser context
248 *
249 * Pops the top element name from the node info stack
250 *
251 * Returns 0 in case of error, the pointer to NodeInfo otherwise
252 */
253static htmlParserNodeInfo *
254htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
255{
256 if (ctxt->nodeInfoNr <= 0)
257 return (NULL);
258 ctxt->nodeInfoNr--;
259 if (ctxt->nodeInfoNr < 0)
260 return (NULL);
261 if (ctxt->nodeInfoNr > 0)
262 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
263 else
264 ctxt->nodeInfo = NULL;
265 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
266}
267
/*
 * Shorthand macros for reading and advancing the parser input.  They are
 * private to the parser and must not be exported.  Every one of them
 * expects a variable named `ctxt` (the parser context) to be in scope at
 * the point of use.
 *
 * Byte-level macros: they look at raw xmlChar values and are only meant
 * for comparing against / skipping over ASCII.
 *
 *   UPPER      current byte passed through toupper()
 *   SKIP(n)    advance the cursor, the column and nbChars by n; must not
 *              be used across a newline since the line counter is not
 *              touched
 *   NXT(n)     the byte n positions after the cursor
 *   UPP(n)     same as NXT(n), passed through toupper()
 *   CUR_PTR    the cursor itself
 *   BASE_PTR   start of the current input buffer
 *   CURRENT    current byte as an int
 *   CUR        current byte as an int
 *   RAW        -1 while ctxt->token is set, else the current byte
 *
 * Buffer management (statement macros built on a bare `if`, so do not
 * place them directly in front of an `else`):
 *
 *   SHRINK     call xmlParserInputShrink() once more than 2*INPUT_CHUNK
 *              has been consumed and less than 2*INPUT_CHUNK remains
 *   GROW       in non-progressive mode, call xmlParserInputGrow() when
 *              fewer than INPUT_CHUNK bytes remain
 *
 * Character-level helpers:
 *
 *   SKIP_BLANKS      htmlSkipBlankChars(ctxt)
 *   NEXT             xmlNextChar(ctxt)
 *   NEXTL(l)         step over the current character of l bytes,
 *                    updating line/col, clearing ctxt->token and adding
 *                    one to nbChars
 *   CUR_CHAR(l)      htmlCurrentChar(ctxt, &l)
 *   CUR_SCHAR(s, l)  xmlStringCurrentChar(ctxt, s, &l)
 *   COPY_BUF(l,b,i,v) append character v (l bytes long) to b at index i
 *                    and advance i; expands to a bare if/else
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

#define SHRINK								\
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&	\
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))	\
        xmlParserInputShrink(ctxt->input)

#define GROW								\
    if ((ctxt->progressive == 0) &&					\
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))		\
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
        ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;		\
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

#define COPY_BUF(l,b,i,v)						\
    if (l == 1) b[i++] = (xmlChar) v;					\
    else i += xmlCopyChar(l,&b[i],v)
348
349/**
350 * htmlFindEncoding:
351 * @the HTML parser context
352 *
353 * Ty to find and encoding in the current data available in the input
354 * buffer this is needed to try to switch to the proper encoding when
355 * one face a character error.
356 * That's an heuristic, since it's operating outside of parsing it could
357 * try to use a meta which had been commented out, that's the reason it
358 * should only be used in case of error, not as a default.
359 *
360 * Returns an encoding string or NULL if not found, the string need to
361 * be freed
362 */
363static xmlChar *
364htmlFindEncoding(xmlParserCtxtPtr ctxt) {
365 const xmlChar *start, *cur, *end;
366
367 if ((ctxt == NULL) || (ctxt->input == NULL) ||
368 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
369 (ctxt->input->buf->encoder != NULL))
370 return(NULL);
371 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
372 return(NULL);
373
374 start = ctxt->input->cur;
375 end = ctxt->input->end;
376 /* we also expect the input buffer to be zero terminated */
377 if (*end != 0)
378 return(NULL);
379
380 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
381 if (cur == NULL)
382 return(NULL);
383 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
384 if (cur == NULL)
385 return(NULL);
386 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
387 if (cur == NULL)
388 return(NULL);
389 cur += 8;
390 start = cur;
391 while (((*cur >= 'A') && (*cur <= 'Z')) ||
392 ((*cur >= 'a') && (*cur <= 'z')) ||
393 ((*cur >= '0') && (*cur <= '9')) ||
394 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
395 cur++;
396 if (cur == start)
397 return(NULL);
398 return(xmlStrndup(start, cur - start));
399}
400
401/**
402 * htmlCurrentChar:
403 * @ctxt: the HTML parser context
404 * @len: pointer to the length of the char read
405 *
406 * The current char value, if using UTF-8 this may actually span multiple
407 * bytes in the input buffer. Implement the end of line normalization:
408 * 2.11 End-of-Line Handling
409 * If the encoding is unspecified, in the case we find an ISO-Latin-1
410 * char, then the encoding converter is plugged in automatically.
411 *
412 * Returns the current char value and its length
413 */
414
415static int
416htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
417 if (ctxt->instate == XML_PARSER_EOF)
418 return(0);
419
420 if (ctxt->token != 0) {
421 *len = 0;
422 return(ctxt->token);
423 }
424 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
425 /*
426 * We are supposed to handle UTF8, check it's valid
427 * From rfc2044: encoding of the Unicode values on UTF-8:
428 *
429 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
430 * 0000 0000-0000 007F 0xxxxxxx
431 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
432 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
433 *
434 * Check for the 0x110000 limit too
435 */
436 const unsigned char *cur = ctxt->input->cur;
437 unsigned char c;
438 unsigned int val;
439
440 c = *cur;
441 if (c & 0x80) {
442 if (cur[1] == 0) {
443 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
444 cur = ctxt->input->cur;
445 }
446 if ((cur[1] & 0xc0) != 0x80)
447 goto encoding_error;
448 if ((c & 0xe0) == 0xe0) {
449
450 if (cur[2] == 0) {
451 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
452 cur = ctxt->input->cur;
453 }
454 if ((cur[2] & 0xc0) != 0x80)
455 goto encoding_error;
456 if ((c & 0xf0) == 0xf0) {
457 if (cur[3] == 0) {
458 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
459 cur = ctxt->input->cur;
460 }
461 if (((c & 0xf8) != 0xf0) ||
462 ((cur[3] & 0xc0) != 0x80))
463 goto encoding_error;
464 /* 4-byte code */
465 *len = 4;
466 val = (cur[0] & 0x7) << 18;
467 val |= (cur[1] & 0x3f) << 12;
468 val |= (cur[2] & 0x3f) << 6;
469 val |= cur[3] & 0x3f;
470 } else {
471 /* 3-byte code */
472 *len = 3;
473 val = (cur[0] & 0xf) << 12;
474 val |= (cur[1] & 0x3f) << 6;
475 val |= cur[2] & 0x3f;
476 }
477 } else {
478 /* 2-byte code */
479 *len = 2;
480 val = (cur[0] & 0x1f) << 6;
481 val |= cur[1] & 0x3f;
482 }
483 if (!IS_CHAR(val)) {
484 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
485 "Char 0x%X out of allowed range\n", val);
486 }
487 return(val);
488 } else {
489 if ((*ctxt->input->cur == 0) &&
490 (ctxt->input->cur < ctxt->input->end)) {
491 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
492 "Char 0x%X out of allowed range\n", 0);
493 *len = 1;
494 return(' ');
495 }
496 /* 1-byte code */
497 *len = 1;
498 return((int) *ctxt->input->cur);
499 }
500 }
501 /*
502 * Assume it's a fixed length encoding (1) with
503 * a compatible encoding for the ASCII set, since
504 * XML constructs only use < 128 chars
505 */
506 *len = 1;
507 if ((int) *ctxt->input->cur < 0x80)
508 return((int) *ctxt->input->cur);
509
510 /*
511 * Humm this is bad, do an automatic flow conversion
512 */
513 {
514 xmlChar * guess;
515 xmlCharEncodingHandlerPtr handler;
516
517 guess = htmlFindEncoding(ctxt);
518 if (guess == NULL) {
519 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
520 } else {
521 if (ctxt->input->encoding != NULL)
522 xmlFree((xmlChar *) ctxt->input->encoding);
523 ctxt->input->encoding = guess;
524 handler = xmlFindCharEncodingHandler((const char *) guess);
525 if (handler != NULL) {
526 xmlSwitchToEncoding(ctxt, handler);
527 } else {
528 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
529 "Unsupported encoding %s", guess, NULL);
530 }
531 }
532 ctxt->charset = XML_CHAR_ENCODING_UTF8;
533 }
534
535 return(xmlCurrentChar(ctxt, len));
536
537encoding_error:
538 /*
539 * If we detect an UTF8 error that probably mean that the
540 * input encoding didn't get properly advertized in the
541 * declaration header. Report the error and switch the encoding
542 * to ISO-Latin-1 (if you don't like this policy, just declare the
543 * encoding !)
544 */
545 {
546 char buffer[150];
547
548 if (ctxt->input->end - ctxt->input->cur >= 4) {
549 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
550 ctxt->input->cur[0], ctxt->input->cur[1],
551 ctxt->input->cur[2], ctxt->input->cur[3]);
552 } else {
553 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
554 }
555 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
556 "Input is not proper UTF-8, indicate encoding !\n",
557 BAD_CAST buffer, NULL);
558 }
559
560 ctxt->charset = XML_CHAR_ENCODING_8859_1;
561 *len = 1;
562 return((int) *ctxt->input->cur);
563}
564
565/**
566 * htmlSkipBlankChars:
567 * @ctxt: the HTML parser context
568 *
569 * skip all blanks character found at that point in the input streams.
570 *
571 * Returns the number of space chars skipped
572 */
573
574static int
575htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
576 int res = 0;
577
578 while (IS_BLANK_CH(*(ctxt->input->cur))) {
579 if ((*ctxt->input->cur == 0) &&
580 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
581 xmlPopInput(ctxt);
582 } else {
583 if (*(ctxt->input->cur) == '\n') {
584 ctxt->input->line++; ctxt->input->col = 1;
585 } else ctxt->input->col++;
586 ctxt->input->cur++;
587 ctxt->nbChars++;
588 if (*ctxt->input->cur == 0)
589 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
590 }
591 res++;
592 }
593 return(res);
594}
595
596
597
598/************************************************************************
599 * *
600 * The list of HTML elements and their properties *
601 * *
602 ************************************************************************/
603
604/*
605 * Start Tag: 1 means the start tag can be omitted
606 * End Tag: 1 means the end tag can be omitted
607 * 2 means it's forbidden (empty elements)
608 * 3 means the tag is stylistic and should be closed easily
609 * Depr: this element is deprecated
610 * DTD: 1 means that this element is valid only in the Loose DTD
611 * 2 means that this element is valid only in the Frameset DTD
612 *
613 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
614 , subElements , impliedsubelt , Attributes, userdata
615 */
616
/*
 * Definitions and a couple of vars for HTML Elements.
 *
 * Each element group is a macro expanding to a comma-separated list of
 * element names, paired with an NB_<group> macro giving the number of
 * names in it.  PCDATA and MODIFIER expand to nothing.  The NB_ sums are
 * parenthesized so they can be used safely inside larger expressions.
 */

#define FONTSTYLE	"tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE	8
#define PHRASE		"em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE	10
#define SPECIAL		"a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL	16
#define INLINE		FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE	(NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK		HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK	(NB_HEADING + NB_LIST + 14)
#define FORMCTRL	"input", "select", "textarea", "label", "button"
#define NB_FORMCTRL	5
#define PCDATA
#define NB_PCDATA	0
#define HEADING		"h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING	6
#define LIST		"ul", "ol", "dir", "menu"
#define NB_LIST		4
#define MODIFIER
#define NB_MODIFIER	0
#define FLOW		BLOCK,INLINE
#define NB_FLOW		(NB_BLOCK + NB_INLINE)
#define EMPTY		NULL


/* NULL-terminated lists of the flow (block + inline) and inline elements */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
650
651
/*
 * ... and for HTML Attributes.
 *
 * Same scheme as for the element groups: each macro expands to a
 * comma-separated list of attribute names and NB_<group> is the number
 * of names in it.
 */

#define COREATTRS	"id", "class", "style", "title"
#define NB_COREATTRS	4
#define I18N		"lang", "dir"
#define NB_I18N		2
#define EVENTS		"onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS	9
#define ATTRS		COREATTRS,I18N,EVENTS
/* was spelled NB_NB_COREATTRS, which is not defined anywhere */
#define NB_ATTRS	(NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN	"align", "char", "charoff"
#define NB_CELLHALIGN	3
#define CELLVALIGN	"valign"
#define NB_CELLVALIGN	1

/* NULL-terminated attribute name lists built from the groups above */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
671
672
673/* Other declarations that should go inline ... */
674static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
675 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
676 "tabindex", "onfocus", "onblur", NULL } ;
677static const char* const target_attr[] = { "target", NULL } ;
678static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
679static const char* const alt_attr[] = { "alt", NULL } ;
680static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
681static const char* const href_attrs[] = { "href", NULL } ;
682static const char* const clear_attrs[] = { "clear", NULL } ;
683static const char* const inline_p[] = { INLINE, "p", NULL } ;
684
685static const char* const flow_param[] = { FLOW, "param", NULL } ;
686static const char* const applet_attrs[] = { COREATTRS , "codebase",
687 "archive", "alt", "name", "height", "width", "align",
688 "hspace", "vspace", NULL } ;
689static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
690 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
691static const char* const basefont_attrs[] =
692 { "id", "size", "color", "face", NULL } ;
693static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
694static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
695static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
696static const char* const body_depr[] = { "background", "bgcolor", "text",
697 "link", "vlink", "alink", NULL } ;
698static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
699 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
700
701
702static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
703static const char* const col_elt[] = { "col", NULL } ;
704static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
705static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
706static const char* const dl_contents[] = { "dt", "dd", NULL } ;
707static const char* const compact_attr[] = { "compact", NULL } ;
708static const char* const label_attr[] = { "label", NULL } ;
709static const char* const fieldset_contents[] = { FLOW, "legend" } ;
710static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
711static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
712static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
713static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
714static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
715static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
716static const char* const head_attrs[] = { I18N, "profile", NULL } ;
717static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
718static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
719static const char* const version_attr[] = { "version", NULL } ;
720static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
721static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
722static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
723static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
724static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
725static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
726static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
727static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
728static const char* const align_attr[] = { "align", NULL } ;
729static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
730static const char* const map_contents[] = { BLOCK, "area", NULL } ;
731static const char* const name_attr[] = { "name", NULL } ;
732static const char* const action_attr[] = { "action", NULL } ;
733static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
734static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
735static const char* const content_attr[] = { "content", NULL } ;
736static const char* const type_attr[] = { "type", NULL } ;
737static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
738static const char* const object_contents[] = { FLOW, "param", NULL } ;
739static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
740static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
741static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
742static const char* const option_elt[] = { "option", NULL } ;
743static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
744static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
745static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
746static const char* const width_attr[] = { "width", NULL } ;
747static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
748static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
749static const char* const language_attr[] = { "language", NULL } ;
750static const char* const select_content[] = { "optgroup", "option", NULL } ;
751static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
752static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
753static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
754static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
755static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
756static const char* const tr_elt[] = { "tr", NULL } ;
757static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
758static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
759static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
760static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
761static const char* const tr_contents[] = { "th", "td", NULL } ;
762static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
763static const char* const li_elt[] = { "li", NULL } ;
764static const char* const ul_depr[] = { "type", "compact", NULL} ;
765static const char* const dir_attr[] = { "dir", NULL} ;
766
767#define DECL (const char**)
768
769static const htmlElemDesc
770html40ElementTable[] = {
771{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
772 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
773},
774{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
775 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
776},
777{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
778 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
779},
780{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
781 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
782},
783{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
784 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
785},
786{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
787 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
788},
789{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
790 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
791},
792{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
793 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
794},
795{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
796 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
797},
798{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
799 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
800},
801{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
802 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
803},
804{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
805 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
806},
807{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
808 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
809},
810{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
811 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
812},
813{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
814 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
815},
816{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
817 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
818},
819{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
820 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
821},
822{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
823 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
824},
825{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
826 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
827},
828{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
829 EMPTY , NULL , DECL col_attrs , NULL, NULL
830},
831{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
832 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
833},
834{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
835 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
836},
837{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
838 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
839},
840{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
841 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
842},
843{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
844 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
845},
846{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
847 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
848},
849{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
850 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
851},
852{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
853 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
854},
855{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
856 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
857},
858{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
859 EMPTY, NULL, DECL embed_attrs, NULL, NULL
860},
861{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
862 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
863},
864{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
865 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
866},
867{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
868 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
869},
870{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
871 EMPTY, NULL, NULL, DECL frame_attrs, NULL
872},
873{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
874 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
875},
876{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
877 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
878},
879{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
880 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
881},
882{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
883 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
884},
885{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
886 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
887},
888{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
889 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
890},
891{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
892 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
893},
894{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
895 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
896},
897{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
898 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
899},
900{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
901 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
902},
903{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
904 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
905},
906{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
907 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
908},
909{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
910 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
911},
912{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
913 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
914},
915{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
916 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
917},
918{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
919 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
920},
921{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
922 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
923},
924{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
925 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
926},
927{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
928 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
929},
930{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
931 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
932},
933{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
934 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
935},
936{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
937 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
938},
939{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
940 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
941},
942{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
943 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
944},
945{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
946 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
947},
948{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
949 DECL html_flow, "div", DECL html_attrs, NULL, NULL
950},
951{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
952 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
953},
954{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
955 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
956},
957{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
958 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
959},
960{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
961 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
962},
963{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
964 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
965},
966{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
967 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
968},
969{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
970 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
971},
972{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
973 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
974},
975{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
976 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
977},
978{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
979 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
980},
981{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
982 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
983},
984{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
985 DECL select_content, NULL, DECL select_attrs, NULL, NULL
986},
987{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
988 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
989},
990{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
991 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
992},
993{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
994 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
995},
996{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
997 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
998},
999{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
1000 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
1001},
1002{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
1003 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1004},
1005{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
1006 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1007},
1008{ "table", 0, 0, 0, 0, 0, 0, 0, "",
1009 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1010},
1011{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
1012 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1013},
1014{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
1015 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1016},
1017{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1018 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1019},
1020{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
1021 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1022},
1023{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
1024 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1025},
1026{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
1027 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1028},
1029{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
1030 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1031},
1032{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
1033 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1034},
1035{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1036 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1037},
1038{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
1039 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1040},
1041{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
1042 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1043},
1044{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1045 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1046}
1047};
1048
/*
 * Start tags that imply the end of the current element.
 *
 * The table is a flat sequence of NULL-terminated groups. The first string
 * of a group is a start tag; the strings that follow it, up to the group's
 * NULL, are the open elements that this start tag implicitly closes. One
 * extra NULL after the last group terminates the whole table.
 *
 * htmlInitAutoClose() stores the address of the first string of each group
 * in htmlStartCloseIndex[], and htmlCheckAutoClose() compares a new tag
 * against those group heads before scanning the rest of the group.
 *
 * NOTE(review): FONTSTYLE is not defined in this part of the file;
 * presumably it expands to a comma-separated list of tag name strings.
 */
static const char * const htmlStartClose[] = {
"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
 "dl", "ul", "ol", "menu", "dir", "address", "pre",
 "listing", "xmp", "head", NULL,
"head", "p", NULL,
"title", "p", NULL,
"body", "head", "style", "link", "title", "p", NULL,
"frameset", "head", "style", "link", "title", "p", NULL,
"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
 "pre", "listing", "xmp", "head", "li", NULL,
"hr", "p", "head", NULL,
"h1", "p", "head", NULL,
"h2", "p", "head", NULL,
"h3", "p", "head", NULL,
"h4", "p", "head", NULL,
"h5", "p", "head", NULL,
"h6", "p", "head", NULL,
"dir", "p", "head", NULL,
"address", "p", "head", "ul", NULL,
"pre", "p", "head", "ul", NULL,
"listing", "p", "head", NULL,
"xmp", "p", "head", NULL,
"blockquote", "p", "head", NULL,
"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
 "xmp", "head", NULL,
"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
 "head", "dd", NULL,
"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
 "head", "dt", NULL,
"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
 "listing", "xmp", NULL,
"ol", "p", "head", "ul", NULL,
"menu", "p", "head", "ul", NULL,
"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
"div", "p", "head", NULL,
"noscript", "p", NULL,
"center", "font", "b", "i", "p", "head", NULL,
"a", "a", "head", NULL,
"caption", "p", NULL,
"colgroup", "caption", "colgroup", "col", "p", NULL,
"col", "caption", "col", "p", NULL,
"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
 "listing", "xmp", "a", NULL,
"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead", "caption", "col", "colgroup", NULL,
"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
 "tbody", "p", NULL,
"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
 "tfoot", "tbody", "p", NULL,
"optgroup", "option", NULL,
"option", "option", NULL,
"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
 "pre", "listing", "xmp", "a", NULL,
/* most tags in FONTSTYLE, PHRASE and SPECIAL should close <head> */
"tt", "head", NULL,
"i", "head", NULL,
"b", "head", NULL,
"u", "head", NULL,
"s", "head", NULL,
"strike", "head", NULL,
"big", "head", NULL,
"small", "head", NULL,

"em", "head", NULL,
"strong", "head", NULL,
"dfn", "head", NULL,
"code", "head", NULL,
"samp", "head", NULL,
"kbd", "head", NULL,
"var", "head", NULL,
"cite", "head", NULL,
"abbr", "head", NULL,
"acronym", "head", NULL,

/* "a" */
"img", "head", NULL,
/* "applet" */
/* "embed" */
/* "object" */
"font", "head", NULL,
/* "basefont" */
"br", "head", NULL,
/* "script" */
"map", "head", NULL,
"q", "head", NULL,
"sub", "head", NULL,
"sup", "head", NULL,
"span", "head", NULL,
"bdo", "head", NULL,
"iframe", "head", NULL,
NULL
};
1146
/*
 * NULL-terminated list of HTML elements which are not supposed to hold
 * character data directly. htmlCheckParagraph() implies a <p> element
 * when characters are about to be inserted while one of these is the
 * current element.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
1159
/*
 * The HTML attributes whose content is of type %Script;.
 * The array has no terminator: users size it with sizeof.
 *
 * NOTE: when adding entries, check htmlIsScriptAttribute() since
 *       it assumes that every name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    /* pointer events */
    "onclick", "ondblclick", "onmousedown", "onmouseup",
    "onmouseover", "onmousemove", "onmouseout",
    /* keyboard events */
    "onkeypress", "onkeydown", "onkeyup",
    /* document events */
    "onload", "onunload",
    /* focus events */
    "onfocus", "onblur",
    /* form events */
    "onsubmit", "onreset", "onchange", "onselect"
};
1185
/*
 * Priorities used by the HTML parser to cope with broken pages.
 * Each element gets a priority, and a misplaced end tag may only close
 * open elements whose priority is lower than or equal to its own
 * (see htmlAutoCloseOnClose()).
 *
 * The table ends with an entry whose name is NULL; its priority is the
 * value returned by htmlGetEndPriority() for every element not listed.
 */

typedef struct {
    const char *name;		/* element name, NULL in the last entry */
    int priority;		/* end tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }		/* Default priority */
};
1213
/*
 * Index into htmlStartClose: each non-NULL slot points at the first string
 * of one group of that table. It is filled by htmlInitAutoClose(), which
 * resets all 100 slots to NULL and then writes at most the first 99, so
 * the last slot always stays NULL.
 */
static const char** htmlStartCloseIndex[100];
/* Set to 1 by htmlInitAutoClose() once htmlStartCloseIndex has been built. */
static int htmlStartCloseIndexinitialized = 0;
1216
1217/************************************************************************
1218 * *
1219 * functions to handle HTML specific data *
1220 * *
1221 ************************************************************************/
1222
1223/**
1224 * htmlInitAutoClose:
1225 *
1226 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1227 * This is not reentrant. Call xmlInitParser() once before processing in
1228 * case of use in multithreaded programs.
1229 */
1230void
1231htmlInitAutoClose(void) {
1232 int indx, i = 0;
1233
1234 if (htmlStartCloseIndexinitialized) return;
1235
1236 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1237 indx = 0;
1238 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1239 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
1240 while (htmlStartClose[i] != NULL) i++;
1241 i++;
1242 }
1243 htmlStartCloseIndexinitialized = 1;
1244}
1245
1246/**
1247 * htmlTagLookup:
1248 * @tag: The tag name in lowercase
1249 *
1250 * Lookup the HTML tag in the ElementTable
1251 *
1252 * Returns the related htmlElemDescPtr or NULL if not found.
1253 */
1254const htmlElemDesc *
1255htmlTagLookup(const xmlChar *tag) {
1256 unsigned int i;
1257
1258 for (i = 0; i < (sizeof(html40ElementTable) /
1259 sizeof(html40ElementTable[0]));i++) {
1260 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1261 return((htmlElemDescPtr) &html40ElementTable[i]);
1262 }
1263 return(NULL);
1264}
1265
1266/**
1267 * htmlGetEndPriority:
1268 * @name: The name of the element to look up the priority for.
1269 *
1270 * Return value: The "endtag" priority.
1271 **/
1272static int
1273htmlGetEndPriority (const xmlChar *name) {
1274 int i = 0;
1275
1276 while ((htmlEndPriority[i].name != NULL) &&
1277 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1278 i++;
1279
1280 return(htmlEndPriority[i].priority);
1281}
1282
1283
1284/**
1285 * htmlCheckAutoClose:
1286 * @newtag: The new tag name
1287 * @oldtag: The old tag name
1288 *
1289 * Checks whether the new tag is one of the registered valid tags for
1290 * closing old.
1291 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1292 *
1293 * Returns 0 if no, 1 if yes.
1294 */
1295static int
1296htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1297{
1298 int i, indx;
1299 const char **closed = NULL;
1300
1301 if (htmlStartCloseIndexinitialized == 0)
1302 htmlInitAutoClose();
1303
1304 /* inefficient, but not a big deal */
1305 for (indx = 0; indx < 100; indx++) {
1306 closed = htmlStartCloseIndex[indx];
1307 if (closed == NULL)
1308 return (0);
1309 if (xmlStrEqual(BAD_CAST * closed, newtag))
1310 break;
1311 }
1312
1313 i = closed - htmlStartClose;
1314 i++;
1315 while (htmlStartClose[i] != NULL) {
1316 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1317 return (1);
1318 }
1319 i++;
1320 }
1321 return (0);
1322}
1323
1324/**
1325 * htmlAutoCloseOnClose:
1326 * @ctxt: an HTML parser context
1327 * @newtag: The new tag name
1328 * @force: force the tag closure
1329 *
1330 * The HTML DTD allows an ending tag to implicitly close other tags.
1331 */
1332static void
1333htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1334{
1335 const htmlElemDesc *info;
1336 int i, priority;
1337
1338 priority = htmlGetEndPriority(newtag);
1339
1340 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1341
1342 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1343 break;
1344 /*
1345 * A missplaced endtag can only close elements with lower
1346 * or equal priority, so if we find an element with higher
1347 * priority before we find an element with
1348 * matching name, we just ignore this endtag
1349 */
1350 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1351 return;
1352 }
1353 if (i < 0)
1354 return;
1355
1356 while (!xmlStrEqual(newtag, ctxt->name)) {
1357 info = htmlTagLookup(ctxt->name);
1358 if ((info != NULL) && (info->endTag == 3)) {
1359 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1360 "Opening and ending tag mismatch: %s and %s\n",
1361 newtag, ctxt->name);
1362 }
1363 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1364 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1365 htmlnamePop(ctxt);
1366 }
1367}
1368
1369/**
1370 * htmlAutoCloseOnEnd:
1371 * @ctxt: an HTML parser context
1372 *
1373 * Close all remaining tags at the end of the stream
1374 */
1375static void
1376htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1377{
1378 int i;
1379
1380 if (ctxt->nameNr == 0)
1381 return;
1382 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1383 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1384 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1385 htmlnamePop(ctxt);
1386 }
1387}
1388
1389/**
1390 * htmlAutoClose:
1391 * @ctxt: an HTML parser context
1392 * @newtag: The new tag name or NULL
1393 *
1394 * The HTML DTD allows a tag to implicitly close other tags.
1395 * The list is kept in htmlStartClose array. This function is
1396 * called when a new tag has been detected and generates the
1397 * appropriates closes if possible/needed.
1398 * If newtag is NULL this mean we are at the end of the resource
1399 * and we should check
1400 */
1401static void
1402htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1403{
1404 while ((newtag != NULL) && (ctxt->name != NULL) &&
1405 (htmlCheckAutoClose(newtag, ctxt->name))) {
1406 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1407 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1408 htmlnamePop(ctxt);
1409 }
1410 if (newtag == NULL) {
1411 htmlAutoCloseOnEnd(ctxt);
1412 return;
1413 }
1414 while ((newtag == NULL) && (ctxt->name != NULL) &&
1415 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1416 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1417 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1418 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1419 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1420 htmlnamePop(ctxt);
1421 }
1422}
1423
1424/**
1425 * htmlAutoCloseTag:
1426 * @doc: the HTML document
1427 * @name: The tag name
1428 * @elem: the HTML element
1429 *
1430 * The HTML DTD allows a tag to implicitly close other tags.
1431 * The list is kept in htmlStartClose array. This function checks
1432 * if the element or one of it's children would autoclose the
1433 * given tag.
1434 *
1435 * Returns 1 if autoclose, 0 otherwise
1436 */
1437int
1438htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1439 htmlNodePtr child;
1440
1441 if (elem == NULL) return(1);
1442 if (xmlStrEqual(name, elem->name)) return(0);
1443 if (htmlCheckAutoClose(elem->name, name)) return(1);
1444 child = elem->children;
1445 while (child != NULL) {
1446 if (htmlAutoCloseTag(doc, name, child)) return(1);
1447 child = child->next;
1448 }
1449 return(0);
1450}
1451
1452/**
1453 * htmlIsAutoClosed:
1454 * @doc: the HTML document
1455 * @elem: the HTML element
1456 *
1457 * The HTML DTD allows a tag to implicitly close other tags.
1458 * The list is kept in htmlStartClose array. This function checks
1459 * if a tag is autoclosed by one of it's child
1460 *
1461 * Returns 1 if autoclosed, 0 otherwise
1462 */
1463int
1464htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1465 htmlNodePtr child;
1466
1467 if (elem == NULL) return(1);
1468 child = elem->children;
1469 while (child != NULL) {
1470 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1471 child = child->next;
1472 }
1473 return(0);
1474}
1475
1476/**
1477 * htmlCheckImplied:
1478 * @ctxt: an HTML parser context
1479 * @newtag: The new tag name
1480 *
1481 * The HTML DTD allows a tag to exists only implicitly
1482 * called when a new tag has been detected and generates the
1483 * appropriates implicit tags if missing
1484 */
1485static void
1486htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1487 int i;
1488
1489 if (ctxt->options & HTML_PARSE_NOIMPLIED)
1490 return;
1491 if (!htmlOmittedDefaultValue)
1492 return;
1493 if (xmlStrEqual(newtag, BAD_CAST"html"))
1494 return;
1495 if (ctxt->nameNr <= 0) {
1496 htmlnamePush(ctxt, BAD_CAST"html");
1497 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1498 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1499 }
1500 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1501 return;
1502 if ((ctxt->nameNr <= 1) &&
1503 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1504 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1505 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1506 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1507 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1508 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1509 if (ctxt->html >= 3) {
1510 /* we already saw or generated an <head> before */
1511 return;
1512 }
1513 /*
1514 * dropped OBJECT ... i you put it first BODY will be
1515 * assumed !
1516 */
1517 htmlnamePush(ctxt, BAD_CAST"head");
1518 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1519 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1520 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1521 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1522 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1523 if (ctxt->html >= 10) {
1524 /* we already saw or generated a <body> before */
1525 return;
1526 }
1527 for (i = 0;i < ctxt->nameNr;i++) {
1528 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1529 return;
1530 }
1531 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1532 return;
1533 }
1534 }
1535
1536 htmlnamePush(ctxt, BAD_CAST"body");
1537 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1538 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1539 }
1540}
1541
1542/**
1543 * htmlCheckParagraph
1544 * @ctxt: an HTML parser context
1545 *
1546 * Check whether a p element need to be implied before inserting
1547 * characters in the current element.
1548 *
1549 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1550 * in case of error.
1551 */
1552
1553static int
1554htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1555 const xmlChar *tag;
1556 int i;
1557
1558 if (ctxt == NULL)
1559 return(-1);
1560 tag = ctxt->name;
1561 if (tag == NULL) {
1562 htmlAutoClose(ctxt, BAD_CAST"p");
1563 htmlCheckImplied(ctxt, BAD_CAST"p");
1564 htmlnamePush(ctxt, BAD_CAST"p");
1565 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1566 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1567 return(1);
1568 }
1569 if (!htmlOmittedDefaultValue)
1570 return(0);
1571 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1572 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1573 htmlAutoClose(ctxt, BAD_CAST"p");
1574 htmlCheckImplied(ctxt, BAD_CAST"p");
1575 htmlnamePush(ctxt, BAD_CAST"p");
1576 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1577 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1578 return(1);
1579 }
1580 }
1581 return(0);
1582}
1583
1584/**
1585 * htmlIsScriptAttribute:
1586 * @name: an attribute name
1587 *
1588 * Check if an attribute is of content type Script
1589 *
1590 * Returns 1 is the attribute is a script 0 otherwise
1591 */
1592int
1593htmlIsScriptAttribute(const xmlChar *name) {
1594 unsigned int i;
1595
1596 if (name == NULL)
1597 return(0);
1598 /*
1599 * all script attributes start with 'on'
1600 */
1601 if ((name[0] != 'o') || (name[1] != 'n'))
1602 return(0);
1603 for (i = 0;
1604 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1605 i++) {
1606 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1607 return(1);
1608 }
1609 return(0);
1610}
1611
1612/************************************************************************
1613 * *
1614 * The list of HTML predefined entities *
1615 * *
1616 ************************************************************************/
1617
1618
1619static const htmlEntityDesc html40EntitiesTable[] = {
1620/*
1621 * the 4 absolute ones, plus apostrophe.
1622 */
1623{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1624{ 38, "amp", "ampersand, U+0026 ISOnum" },
1625{ 39, "apos", "single quote" },
1626{ 60, "lt", "less-than sign, U+003C ISOnum" },
1627{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1628
1629/*
1630 * A bunch still in the 128-255 range
1631 * Replacing them depend really on the charset used.
1632 */
1633{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1634{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1635{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1636{ 163, "pound","pound sign, U+00A3 ISOnum" },
1637{ 164, "curren","currency sign, U+00A4 ISOnum" },
1638{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1639{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1640{ 167, "sect", "section sign, U+00A7 ISOnum" },
1641{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1642{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1643{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1644{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1645{ 172, "not", "not sign, U+00AC ISOnum" },
1646{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1647{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1648{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1649{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1650{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1651{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1652{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1653{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1654{ 181, "micro","micro sign, U+00B5 ISOnum" },
1655{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1656{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1657{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1658{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1659{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1660{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1661{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1662{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1663{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1664{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1665{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1666{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1667{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1668{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1669{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1670{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1671{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1672{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1673{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1674{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1675{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1676{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1677{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1678{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1679{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1680{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1681{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1682{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1683{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1684{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1685{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1686{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1687{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1688{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1689{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1690{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1691{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1692{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1693{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1694{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1695{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1696{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1697{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1698{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1699{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1700{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1701{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1702{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1703{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1704{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1705{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1706{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1707{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1708{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1709{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1710{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1711{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1712{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1713{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1714{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1715{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1716{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1717{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1718{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1719{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1720{ 247, "divide","division sign, U+00F7 ISOnum" },
1721{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1722{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1723{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1724{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1725{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1726{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1727{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1728{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1729
1730{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1731{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1732{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1733{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1734{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1735
1736/*
1737 * Anything below should really be kept as entities references
1738 */
1739{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1740
1741{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1742{ 732, "tilde","small tilde, U+02DC ISOdia" },
1743
1744{ 913, "Alpha","greek capital letter alpha, U+0391" },
1745{ 914, "Beta", "greek capital letter beta, U+0392" },
1746{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1747{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1748{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1749{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1750{ 919, "Eta", "greek capital letter eta, U+0397" },
1751{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1752{ 921, "Iota", "greek capital letter iota, U+0399" },
1753{ 922, "Kappa","greek capital letter kappa, U+039A" },
1754{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1755{ 924, "Mu", "greek capital letter mu, U+039C" },
1756{ 925, "Nu", "greek capital letter nu, U+039D" },
1757{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1758{ 927, "Omicron","greek capital letter omicron, U+039F" },
1759{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1760{ 929, "Rho", "greek capital letter rho, U+03A1" },
1761{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1762{ 932, "Tau", "greek capital letter tau, U+03A4" },
1763{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1764{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1765{ 935, "Chi", "greek capital letter chi, U+03A7" },
1766{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1767{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1768
1769{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1770{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1771{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1772{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1773{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1774{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1775{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1776{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1777{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1778{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1779{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1780{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1781{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1782{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1783{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1784{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1785{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1786{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1787{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1788{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1789{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1790{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1791{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1792{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1793{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1794{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1795{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1796{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1797
1798{ 8194, "ensp", "en space, U+2002 ISOpub" },
1799{ 8195, "emsp", "em space, U+2003 ISOpub" },
1800{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1801{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1802{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1803{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1804{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1805{ 8211, "ndash","en dash, U+2013 ISOpub" },
1806{ 8212, "mdash","em dash, U+2014 ISOpub" },
1807{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1808{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1809{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1810{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1811{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1812{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1813{ 8224, "dagger","dagger, U+2020 ISOpub" },
1814{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1815
1816{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1817{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1818
1819{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1820
1821{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1822{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1823
1824{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1825{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1826
1827{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1828{ 8260, "frasl","fraction slash, U+2044 NEW" },
1829
1830{ 8364, "euro", "euro sign, U+20AC NEW" },
1831
1832{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1833{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1834{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1835{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1836{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1837{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1838{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1839{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1840{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1841{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1842{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1843{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1844{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1845{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1846{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1847{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1848
1849{ 8704, "forall","for all, U+2200 ISOtech" },
1850{ 8706, "part", "partial differential, U+2202 ISOtech" },
1851{ 8707, "exist","there exists, U+2203 ISOtech" },
1852{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1853{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1854{ 8712, "isin", "element of, U+2208 ISOtech" },
1855{ 8713, "notin","not an element of, U+2209 ISOtech" },
1856{ 8715, "ni", "contains as member, U+220B ISOtech" },
1857{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1858{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
1859{ 8722, "minus","minus sign, U+2212 ISOtech" },
1860{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1861{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1862{ 8733, "prop", "proportional to, U+221D ISOtech" },
1863{ 8734, "infin","infinity, U+221E ISOtech" },
1864{ 8736, "ang", "angle, U+2220 ISOamso" },
1865{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1866{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1867{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1868{ 8746, "cup", "union = cup, U+222A ISOtech" },
1869{ 8747, "int", "integral, U+222B ISOtech" },
1870{ 8756, "there4","therefore, U+2234 ISOtech" },
1871{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1872{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1873{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1874{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1875{ 8801, "equiv","identical to, U+2261 ISOtech" },
1876{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1877{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1878{ 8834, "sub", "subset of, U+2282 ISOtech" },
1879{ 8835, "sup", "superset of, U+2283 ISOtech" },
1880{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1881{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1882{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1883{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1884{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1885{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1886{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1887{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1888{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1889{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1890{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1891{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1892{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1893{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1894
1895{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1896{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1897{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1898{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1899
1900};
1901
1902/************************************************************************
1903 * *
1904 * Commodity functions to handle entities *
1905 * *
1906 ************************************************************************/
1907
/*
 * growBuffer: double the capacity of a heap allocated xmlChar buffer.
 *
 * @buffer is the name of the buffer variable. The macro pastes "_size"
 * onto that name to reach the companion capacity variable, doubles it and
 * reallocates the buffer through xmlRealloc().
 *
 * A variable named ctxt must be in scope at the point of use: when the
 * reallocation fails the macro reports it with htmlErrMemory(ctxt, ...),
 * frees the old buffer and executes return(NULL) in the function that
 * invoked it. Pointers into the old buffer must be recomputed by the
 * caller after the macro, since the buffer may have moved.
 *
 * The expansion is a plain brace block, not a do { } while (0) statement.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *tmp;                                                       \
    buffer##_size *= 2;                                                 \
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp == NULL) {                                                  \
        htmlErrMemory(ctxt, "growing buffer\n");                        \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
    buffer = tmp;                                                       \
}
1922
1923/**
1924 * htmlEntityLookup:
1925 * @name: the entity name
1926 *
1927 * Lookup the given entity in EntitiesTable
1928 *
1929 * TODO: the linear scan is really ugly, an hash table is really needed.
1930 *
1931 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1932 */
1933const htmlEntityDesc *
1934htmlEntityLookup(const xmlChar *name) {
1935 unsigned int i;
1936
1937 for (i = 0;i < (sizeof(html40EntitiesTable)/
1938 sizeof(html40EntitiesTable[0]));i++) {
1939 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1940 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1941 }
1942 }
1943 return(NULL);
1944}
1945
1946/**
1947 * htmlEntityValueLookup:
1948 * @value: the entity's unicode value
1949 *
1950 * Lookup the given entity in EntitiesTable
1951 *
1952 * TODO: the linear scan is really ugly, an hash table is really needed.
1953 *
1954 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1955 */
1956const htmlEntityDesc *
1957htmlEntityValueLookup(unsigned int value) {
1958 unsigned int i;
1959
1960 for (i = 0;i < (sizeof(html40EntitiesTable)/
1961 sizeof(html40EntitiesTable[0]));i++) {
1962 if (html40EntitiesTable[i].value >= value) {
1963 if (html40EntitiesTable[i].value > value)
1964 break;
1965 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1966 }
1967 }
1968 return(NULL);
1969}
1970
1971/**
1972 * UTF8ToHtml:
1973 * @out: a pointer to an array of bytes to store the result
1974 * @outlen: the length of @out
1975 * @in: a pointer to an array of UTF-8 chars
1976 * @inlen: the length of @in
1977 *
1978 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1979 * plus HTML entities block of chars out.
1980 *
1981 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1982 * The value of @inlen after return is the number of octets consumed
1983 * as the return value is positive, else unpredictable.
1984 * The value of @outlen after return is the number of octets consumed.
1985 */
1986int
1987UTF8ToHtml(unsigned char* out, int *outlen,
1988 const unsigned char* in, int *inlen) {
1989 const unsigned char* processed = in;
1990 const unsigned char* outend;
1991 const unsigned char* outstart = out;
1992 const unsigned char* instart = in;
1993 const unsigned char* inend;
1994 unsigned int c, d;
1995 int trailing;
1996
1997 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
1998 if (in == NULL) {
1999 /*
2000 * initialization nothing to do
2001 */
2002 *outlen = 0;
2003 *inlen = 0;
2004 return(0);
2005 }
2006 inend = in + (*inlen);
2007 outend = out + (*outlen);
2008 while (in < inend) {
2009 d = *in++;
2010 if (d < 0x80) { c= d; trailing= 0; }
2011 else if (d < 0xC0) {
2012 /* trailing byte in leading position */
2013 *outlen = out - outstart;
2014 *inlen = processed - instart;
2015 return(-2);
2016 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2017 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2018 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2019 else {
2020 /* no chance for this in Ascii */
2021 *outlen = out - outstart;
2022 *inlen = processed - instart;
2023 return(-2);
2024 }
2025
2026 if (inend - in < trailing) {
2027 break;
2028 }
2029
2030 for ( ; trailing; trailing--) {
2031 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2032 break;
2033 c <<= 6;
2034 c |= d & 0x3F;
2035 }
2036
2037 /* assertion: c is a single UTF-4 value */
2038 if (c < 0x80) {
2039 if (out + 1 >= outend)
2040 break;
2041 *out++ = c;
2042 } else {
2043 int len;
2044 const htmlEntityDesc * ent;
2045 const char *cp;
2046 char nbuf[16];
2047
2048 /*
2049 * Try to lookup a predefined HTML entity for it
2050 */
2051
2052 ent = htmlEntityValueLookup(c);
2053 if (ent == NULL) {
2054 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2055 cp = nbuf;
2056 }
2057 else
2058 cp = ent->name;
2059 len = strlen(cp);
2060 if (out + 2 + len >= outend)
2061 break;
2062 *out++ = '&';
2063 memcpy(out, cp, len);
2064 out += len;
2065 *out++ = ';';
2066 }
2067 processed = in;
2068 }
2069 *outlen = out - outstart;
2070 *inlen = processed - instart;
2071 return(0);
2072}
2073
2074/**
2075 * htmlEncodeEntities:
2076 * @out: a pointer to an array of bytes to store the result
2077 * @outlen: the length of @out
2078 * @in: a pointer to an array of UTF-8 chars
2079 * @inlen: the length of @in
2080 * @quoteChar: the quote character to escape (' or ") or zero.
2081 *
2082 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2083 * plus HTML entities block of chars out.
2084 *
2085 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2086 * The value of @inlen after return is the number of octets consumed
2087 * as the return value is positive, else unpredictable.
2088 * The value of @outlen after return is the number of octets consumed.
2089 */
2090int
2091htmlEncodeEntities(unsigned char* out, int *outlen,
2092 const unsigned char* in, int *inlen, int quoteChar) {
2093 const unsigned char* processed = in;
2094 const unsigned char* outend;
2095 const unsigned char* outstart = out;
2096 const unsigned char* instart = in;
2097 const unsigned char* inend;
2098 unsigned int c, d;
2099 int trailing;
2100
2101 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2102 return(-1);
2103 outend = out + (*outlen);
2104 inend = in + (*inlen);
2105 while (in < inend) {
2106 d = *in++;
2107 if (d < 0x80) { c= d; trailing= 0; }
2108 else if (d < 0xC0) {
2109 /* trailing byte in leading position */
2110 *outlen = out - outstart;
2111 *inlen = processed - instart;
2112 return(-2);
2113 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2114 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2115 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2116 else {
2117 /* no chance for this in Ascii */
2118 *outlen = out - outstart;
2119 *inlen = processed - instart;
2120 return(-2);
2121 }
2122
2123 if (inend - in < trailing)
2124 break;
2125
2126 while (trailing--) {
2127 if (((d= *in++) & 0xC0) != 0x80) {
2128 *outlen = out - outstart;
2129 *inlen = processed - instart;
2130 return(-2);
2131 }
2132 c <<= 6;
2133 c |= d & 0x3F;
2134 }
2135
2136 /* assertion: c is a single UTF-4 value */
2137 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2138 (c != '&') && (c != '<') && (c != '>')) {
2139 if (out >= outend)
2140 break;
2141 *out++ = c;
2142 } else {
2143 const htmlEntityDesc * ent;
2144 const char *cp;
2145 char nbuf[16];
2146 int len;
2147
2148 /*
2149 * Try to lookup a predefined HTML entity for it
2150 */
2151 ent = htmlEntityValueLookup(c);
2152 if (ent == NULL) {
2153 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2154 cp = nbuf;
2155 }
2156 else
2157 cp = ent->name;
2158 len = strlen(cp);
2159 if (out + 2 + len > outend)
2160 break;
2161 *out++ = '&';
2162 memcpy(out, cp, len);
2163 out += len;
2164 *out++ = ';';
2165 }
2166 processed = in;
2167 }
2168 *outlen = out - outstart;
2169 *inlen = processed - instart;
2170 return(0);
2171}
2172
2173/************************************************************************
2174 * *
2175 * Commodity functions to handle streams *
2176 * *
2177 ************************************************************************/
2178
2179/**
2180 * htmlNewInputStream:
2181 * @ctxt: an HTML parser context
2182 *
2183 * Create a new input stream structure
2184 * Returns the new input stream or NULL
2185 */
2186static htmlParserInputPtr
2187htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2188 htmlParserInputPtr input;
2189
2190 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2191 if (input == NULL) {
2192 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2193 return(NULL);
2194 }
2195 memset(input, 0, sizeof(htmlParserInput));
2196 input->filename = NULL;
2197 input->directory = NULL;
2198 input->base = NULL;
2199 input->cur = NULL;
2200 input->buf = NULL;
2201 input->line = 1;
2202 input->col = 1;
2203 input->buf = NULL;
2204 input->free = NULL;
2205 input->version = NULL;
2206 input->consumed = 0;
2207 input->length = 0;
2208 return(input);
2209}
2210
2211
2212/************************************************************************
2213 * *
2214 * Commodity functions, cleanup needed ? *
2215 * *
2216 ************************************************************************/
/*
 * All tags allowing PCDATA from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but I don't want to risk any
 *       binary incompatibility.
 * The table is only ever read, so both the strings and the array of
 * pointers are declared const.
 */
static const char * const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
2231
2232/**
2233 * areBlanks:
2234 * @ctxt: an HTML parser context
2235 * @str: a xmlChar *
2236 * @len: the size of @str
2237 *
2238 * Is this a sequence of blank chars that one can ignore ?
2239 *
2240 * Returns 1 if ignorable 0 otherwise.
2241 */
2242
2243static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2244 unsigned int i;
2245 int j;
2246 xmlNodePtr lastChild;
2247 xmlDtdPtr dtd;
2248
2249 for (j = 0;j < len;j++)
2250 if (!(IS_BLANK_CH(str[j]))) return(0);
2251
2252 if (CUR == 0) return(1);
2253 if (CUR != '<') return(0);
2254 if (ctxt->name == NULL)
2255 return(1);
2256 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2257 return(1);
2258 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2259 return(1);
2260
2261 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2262 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2263 dtd = xmlGetIntSubset(ctxt->myDoc);
2264 if (dtd != NULL && dtd->ExternalID != NULL) {
2265 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2266 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2267 return(1);
2268 }
2269 }
2270
2271 if (ctxt->node == NULL) return(0);
2272 lastChild = xmlGetLastChild(ctxt->node);
2273 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2274 lastChild = lastChild->prev;
2275 if (lastChild == NULL) {
2276 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2277 (ctxt->node->content != NULL)) return(0);
2278 /* keep ws in constructs like ...<b> </b>...
2279 for all tags "b" allowing PCDATA */
2280 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2281 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2282 return(0);
2283 }
2284 }
2285 } else if (xmlNodeIsText(lastChild)) {
2286 return(0);
2287 } else {
2288 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2289 for all tags "p" allowing PCDATA */
2290 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2291 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2292 return(0);
2293 }
2294 }
2295 }
2296 return(1);
2297}
2298
2299/**
2300 * htmlNewDocNoDtD:
2301 * @URI: URI for the dtd, or NULL
2302 * @ExternalID: the external ID of the DTD, or NULL
2303 *
2304 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2305 * are NULL
2306 *
2307 * Returns a new document, do not initialize the DTD if not provided
2308 */
2309htmlDocPtr
2310htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2311 xmlDocPtr cur;
2312
2313 /*
2314 * Allocate a new document and fill the fields.
2315 */
2316 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2317 if (cur == NULL) {
2318 htmlErrMemory(NULL, "HTML document creation failed\n");
2319 return(NULL);
2320 }
2321 memset(cur, 0, sizeof(xmlDoc));
2322
2323 cur->type = XML_HTML_DOCUMENT_NODE;
2324 cur->version = NULL;
2325 cur->intSubset = NULL;
2326 cur->doc = cur;
2327 cur->name = NULL;
2328 cur->children = NULL;
2329 cur->extSubset = NULL;
2330 cur->oldNs = NULL;
2331 cur->encoding = NULL;
2332 cur->standalone = 1;
2333 cur->compression = 0;
2334 cur->ids = NULL;
2335 cur->refs = NULL;
2336 cur->_private = NULL;
2337 cur->charset = XML_CHAR_ENCODING_UTF8;
2338 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2339 if ((ExternalID != NULL) ||
2340 (URI != NULL))
2341 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2342 return(cur);
2343}
2344
2345/**
2346 * htmlNewDoc:
2347 * @URI: URI for the dtd, or NULL
2348 * @ExternalID: the external ID of the DTD, or NULL
2349 *
2350 * Creates a new HTML document
2351 *
2352 * Returns a new document
2353 */
2354htmlDocPtr
2355htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2356 if ((URI == NULL) && (ExternalID == NULL))
2357 return(htmlNewDocNoDtD(
2358 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2359 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2360
2361 return(htmlNewDocNoDtD(URI, ExternalID));
2362}
2363
2364
2365/************************************************************************
2366 * *
2367 * The parser itself *
2368 * Relates to http://www.w3.org/TR/html40 *
2369 * *
2370 ************************************************************************/
2371
2372/************************************************************************
2373 * *
2374 * The parser itself *
2375 * *
2376 ************************************************************************/
2377
2378static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2379
2380/**
2381 * htmlParseHTMLName:
2382 * @ctxt: an HTML parser context
2383 *
2384 * parse an HTML tag or attribute name, note that we convert it to lowercase
2385 * since HTML names are not case-sensitive.
2386 *
2387 * Returns the Tag Name parsed or NULL
2388 */
2389
2390static const xmlChar *
2391htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2392 int i = 0;
2393 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2394
2395 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2396 (CUR != ':') && (CUR != '.')) return(NULL);
2397
2398 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2399 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2400 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2401 (CUR == '.'))) {
2402 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2403 else loc[i] = CUR;
2404 i++;
2405
2406 NEXT;
2407 }
2408
2409 return(xmlDictLookup(ctxt->dict, loc, i));
2410}
2411
2412
2413/**
2414 * htmlParseHTMLName_nonInvasive:
2415 * @ctxt: an HTML parser context
2416 *
2417 * parse an HTML tag or attribute name, note that we convert it to lowercase
2418 * since HTML names are not case-sensitive, this doesn't consume the data
2419 * from the stream, it's a look-ahead
2420 *
2421 * Returns the Tag Name parsed or NULL
2422 */
2423
2424static const xmlChar *
2425htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2426 int i = 0;
2427 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2428
2429 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2430 (NXT(1) != ':')) return(NULL);
2431
2432 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2433 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2434 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2435 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2436 else loc[i] = NXT(1+i);
2437 i++;
2438 }
2439
2440 return(xmlDictLookup(ctxt->dict, loc, i));
2441}
2442
2443
2444/**
2445 * htmlParseName:
2446 * @ctxt: an HTML parser context
2447 *
2448 * parse an HTML name, this routine is case sensitive.
2449 *
2450 * Returns the Name parsed or NULL
2451 */
2452
2453static const xmlChar *
2454htmlParseName(htmlParserCtxtPtr ctxt) {
2455 const xmlChar *in;
2456 const xmlChar *ret;
2457 int count = 0;
2458
2459 GROW;
2460
2461 /*
2462 * Accelerator for simple ASCII names
2463 */
2464 in = ctxt->input->cur;
2465 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2466 ((*in >= 0x41) && (*in <= 0x5A)) ||
2467 (*in == '_') || (*in == ':')) {
2468 in++;
2469 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2470 ((*in >= 0x41) && (*in <= 0x5A)) ||
2471 ((*in >= 0x30) && (*in <= 0x39)) ||
2472 (*in == '_') || (*in == '-') ||
2473 (*in == ':') || (*in == '.'))
2474 in++;
2475
2476 if (in == ctxt->input->end)
2477 return(NULL);
2478
2479 if ((*in > 0) && (*in < 0x80)) {
2480 count = in - ctxt->input->cur;
2481 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2482 ctxt->input->cur = in;
2483 ctxt->nbChars += count;
2484 ctxt->input->col += count;
2485 return(ret);
2486 }
2487 }
2488 return(htmlParseNameComplex(ctxt));
2489}
2490
2491static const xmlChar *
2492htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2493 int len = 0, l;
2494 int c;
2495 int count = 0;
2496 const xmlChar *base = ctxt->input->base;
2497
2498 /*
2499 * Handler for more complex cases
2500 */
2501 GROW;
2502 c = CUR_CHAR(l);
2503 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2504 (!IS_LETTER(c) && (c != '_') &&
2505 (c != ':'))) {
2506 return(NULL);
2507 }
2508
2509 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2510 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2511 (c == '.') || (c == '-') ||
2512 (c == '_') || (c == ':') ||
2513 (IS_COMBINING(c)) ||
2514 (IS_EXTENDER(c)))) {
2515 if (count++ > 100) {
2516 count = 0;
2517 GROW;
2518 }
2519 len += l;
2520 NEXTL(l);
2521 c = CUR_CHAR(l);
2522 if (ctxt->input->base != base) {
2523 /*
2524 * We changed encoding from an unknown encoding
2525 * Input buffer changed location, so we better start again
2526 */
2527 return(htmlParseNameComplex(ctxt));
2528 }
2529 }
2530
2531 if (ctxt->input->base > ctxt->input->cur - len)
2532 return(NULL);
2533
2534 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2535}
2536
2537
2538/**
2539 * htmlParseHTMLAttribute:
2540 * @ctxt: an HTML parser context
2541 * @stop: a char stop value
2542 *
2543 * parse an HTML attribute value till the stop (quote), if
2544 * stop is 0 then it stops at the first space
2545 *
2546 * Returns the attribute parsed or NULL
2547 */
2548
2549static xmlChar *
2550htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2551 xmlChar *buffer = NULL;
2552 int buffer_size = 0;
2553 xmlChar *out = NULL;
2554 const xmlChar *name = NULL;
2555 const xmlChar *cur = NULL;
2556 const htmlEntityDesc * ent;
2557
2558 /*
2559 * allocate a translation buffer.
2560 */
2561 buffer_size = HTML_PARSER_BUFFER_SIZE;
2562 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2563 if (buffer == NULL) {
2564 htmlErrMemory(ctxt, "buffer allocation failed\n");
2565 return(NULL);
2566 }
2567 out = buffer;
2568
2569 /*
2570 * Ok loop until we reach one of the ending chars
2571 */
2572 while ((CUR != 0) && (CUR != stop)) {
2573 if ((stop == 0) && (CUR == '>')) break;
2574 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2575 if (CUR == '&') {
2576 if (NXT(1) == '#') {
2577 unsigned int c;
2578 int bits;
2579
2580 c = htmlParseCharRef(ctxt);
2581 if (c < 0x80)
2582 { *out++ = c; bits= -6; }
2583 else if (c < 0x800)
2584 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2585 else if (c < 0x10000)
2586 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2587 else
2588 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2589
2590 for ( ; bits >= 0; bits-= 6) {
2591 *out++ = ((c >> bits) & 0x3F) | 0x80;
2592 }
2593
2594 if (out - buffer > buffer_size - 100) {
2595 int indx = out - buffer;
2596
2597 growBuffer(buffer);
2598 out = &buffer[indx];
2599 }
2600 } else {
2601 ent = htmlParseEntityRef(ctxt, &name);
2602 if (name == NULL) {
2603 *out++ = '&';
2604 if (out - buffer > buffer_size - 100) {
2605 int indx = out - buffer;
2606
2607 growBuffer(buffer);
2608 out = &buffer[indx];
2609 }
2610 } else if (ent == NULL) {
2611 *out++ = '&';
2612 cur = name;
2613 while (*cur != 0) {
2614 if (out - buffer > buffer_size - 100) {
2615 int indx = out - buffer;
2616
2617 growBuffer(buffer);
2618 out = &buffer[indx];
2619 }
2620 *out++ = *cur++;
2621 }
2622 } else {
2623 unsigned int c;
2624 int bits;
2625
2626 if (out - buffer > buffer_size - 100) {
2627 int indx = out - buffer;
2628
2629 growBuffer(buffer);
2630 out = &buffer[indx];
2631 }
2632 c = ent->value;
2633 if (c < 0x80)
2634 { *out++ = c; bits= -6; }
2635 else if (c < 0x800)
2636 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2637 else if (c < 0x10000)
2638 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2639 else
2640 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2641
2642 for ( ; bits >= 0; bits-= 6) {
2643 *out++ = ((c >> bits) & 0x3F) | 0x80;
2644 }
2645 }
2646 }
2647 } else {
2648 unsigned int c;
2649 int bits, l;
2650
2651 if (out - buffer > buffer_size - 100) {
2652 int indx = out - buffer;
2653
2654 growBuffer(buffer);
2655 out = &buffer[indx];
2656 }
2657 c = CUR_CHAR(l);
2658 if (c < 0x80)
2659 { *out++ = c; bits= -6; }
2660 else if (c < 0x800)
2661 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2662 else if (c < 0x10000)
2663 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2664 else
2665 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2666
2667 for ( ; bits >= 0; bits-= 6) {
2668 *out++ = ((c >> bits) & 0x3F) | 0x80;
2669 }
2670 NEXT;
2671 }
2672 }
2673 *out = 0;
2674 return(buffer);
2675}
2676
2677/**
2678 * htmlParseEntityRef:
2679 * @ctxt: an HTML parser context
2680 * @str: location to store the entity name
2681 *
2682 * parse an HTML ENTITY references
2683 *
2684 * [68] EntityRef ::= '&' Name ';'
2685 *
2686 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2687 * if non-NULL *str will have to be freed by the caller.
2688 */
2689const htmlEntityDesc *
2690htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2691 const xmlChar *name;
2692 const htmlEntityDesc * ent = NULL;
2693
2694 if (str != NULL) *str = NULL;
2695 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2696
2697 if (CUR == '&') {
2698 NEXT;
2699 name = htmlParseName(ctxt);
2700 if (name == NULL) {
2701 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2702 "htmlParseEntityRef: no name\n", NULL, NULL);
2703 } else {
2704 GROW;
2705 if (CUR == ';') {
2706 if (str != NULL)
2707 *str = name;
2708
2709 /*
2710 * Lookup the entity in the table.
2711 */
2712 ent = htmlEntityLookup(name);
2713 if (ent != NULL) /* OK that's ugly !!! */
2714 NEXT;
2715 } else {
2716 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2717 "htmlParseEntityRef: expecting ';'\n",
2718 NULL, NULL);
2719 if (str != NULL)
2720 *str = name;
2721 }
2722 }
2723 }
2724 return(ent);
2725}
2726
2727/**
2728 * htmlParseAttValue:
2729 * @ctxt: an HTML parser context
2730 *
2731 * parse a value for an attribute
2732 * Note: the parser won't do substitution of entities here, this
2733 * will be handled later in xmlStringGetNodeList, unless it was
2734 * asked for ctxt->replaceEntities != 0
2735 *
2736 * Returns the AttValue parsed or NULL.
2737 */
2738
2739static xmlChar *
2740htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2741 xmlChar *ret = NULL;
2742
2743 if (CUR == '"') {
2744 NEXT;
2745 ret = htmlParseHTMLAttribute(ctxt, '"');
2746 if (CUR != '"') {
2747 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2748 "AttValue: \" expected\n", NULL, NULL);
2749 } else
2750 NEXT;
2751 } else if (CUR == '\'') {
2752 NEXT;
2753 ret = htmlParseHTMLAttribute(ctxt, '\'');
2754 if (CUR != '\'') {
2755 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2756 "AttValue: ' expected\n", NULL, NULL);
2757 } else
2758 NEXT;
2759 } else {
2760 /*
2761 * That's an HTMLism, the attribute value may not be quoted
2762 */
2763 ret = htmlParseHTMLAttribute(ctxt, 0);
2764 if (ret == NULL) {
2765 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2766 "AttValue: no value found\n", NULL, NULL);
2767 }
2768 }
2769 return(ret);
2770}
2771
2772/**
2773 * htmlParseSystemLiteral:
2774 * @ctxt: an HTML parser context
2775 *
2776 * parse an HTML Literal
2777 *
2778 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2779 *
2780 * Returns the SystemLiteral parsed or NULL
2781 */
2782
2783static xmlChar *
2784htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2785 size_t len = 0, startPosition = 0;
2786 xmlChar *ret = NULL;
2787
2788 if (CUR == '"') {
2789 NEXT;
2790
2791 if (CUR_PTR < BASE_PTR)
2792 return(ret);
2793 startPosition = CUR_PTR - BASE_PTR;
2794
2795 while ((IS_CHAR_CH(CUR)) && (CUR != '"')) {
2796 NEXT;
2797 len++;
2798 }
2799 if (!IS_CHAR_CH(CUR)) {
2800 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2801 "Unfinished SystemLiteral\n", NULL, NULL);
2802 } else {
2803 ret = xmlStrndup((BASE_PTR+startPosition), len);
2804 NEXT;
2805 }
2806 } else if (CUR == '\'') {
2807 NEXT;
2808
2809 if (CUR_PTR < BASE_PTR)
2810 return(ret);
2811 startPosition = CUR_PTR - BASE_PTR;
2812
2813 while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) {
2814 NEXT;
2815 len++;
2816 }
2817 if (!IS_CHAR_CH(CUR)) {
2818 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2819 "Unfinished SystemLiteral\n", NULL, NULL);
2820 } else {
2821 ret = xmlStrndup((BASE_PTR+startPosition), len);
2822 NEXT;
2823 }
2824 } else {
2825 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2826 " or ' expected\n", NULL, NULL);
2827 }
2828
2829 return(ret);
2830}
2831
2832/**
2833 * htmlParsePubidLiteral:
2834 * @ctxt: an HTML parser context
2835 *
2836 * parse an HTML public literal
2837 *
2838 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2839 *
2840 * Returns the PubidLiteral parsed or NULL.
2841 */
2842
2843static xmlChar *
2844htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2845 size_t len = 0, startPosition = 0;
2846 xmlChar *ret = NULL;
2847 /*
2848 * Name ::= (Letter | '_') (NameChar)*
2849 */
2850 if (CUR == '"') {
2851 NEXT;
2852
2853 if (CUR_PTR < BASE_PTR)
2854 return(ret);
2855 startPosition = CUR_PTR - BASE_PTR;
2856
2857 while (IS_PUBIDCHAR_CH(CUR)) {
2858 len++;
2859 NEXT;
2860 }
2861
2862 if (CUR != '"') {
2863 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2864 "Unfinished PubidLiteral\n", NULL, NULL);
2865 } else {
2866 ret = xmlStrndup((BASE_PTR + startPosition), len);
2867 NEXT;
2868 }
2869 } else if (CUR == '\'') {
2870 NEXT;
2871
2872 if (CUR_PTR < BASE_PTR)
2873 return(ret);
2874 startPosition = CUR_PTR - BASE_PTR;
2875
2876 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')){
2877 len++;
2878 NEXT;
2879 }
2880
2881 if (CUR != '\'') {
2882 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2883 "Unfinished PubidLiteral\n", NULL, NULL);
2884 } else {
2885 ret = xmlStrndup((BASE_PTR + startPosition), len);
2886 NEXT;
2887 }
2888 } else {
2889 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2890 "PubidLiteral \" or ' expected\n", NULL, NULL);
2891 }
2892
2893 return(ret);
2894}
2895
2896/**
2897 * htmlParseScript:
2898 * @ctxt: an HTML parser context
2899 *
2900 * parse the content of an HTML SCRIPT or STYLE element
2901 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2902 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2903 * http://www.w3.org/TR/html4/types.html#type-script
2904 * http://www.w3.org/TR/html4/types.html#h-6.15
2905 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2906 *
2907 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2908 * element and the value of intrinsic event attributes. User agents must
2909 * not evaluate script data as HTML markup but instead must pass it on as
2910 * data to a script engine.
2911 * NOTES:
2912 * - The content is passed like CDATA
2913 * - the attributes for style and scripting "onXXX" are also described
2914 * as CDATA but SGML allows entities references in attributes so their
2915 * processing is identical as other attributes
2916 */
2917static void
2918htmlParseScript(htmlParserCtxtPtr ctxt) {
2919 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2920 int nbchar = 0;
2921 int cur,l;
2922
2923 SHRINK;
2924 cur = CUR_CHAR(l);
2925 while (IS_CHAR_CH(cur)) {
2926 if ((cur == '<') && (NXT(1) == '/')) {
2927 /*
2928 * One should break here, the specification is clear:
2929 * Authors should therefore escape "</" within the content.
2930 * Escape mechanisms are specific to each scripting or
2931 * style sheet language.
2932 *
2933 * In recovery mode, only break if end tag match the
2934 * current tag, effectively ignoring all tags inside the
2935 * script/style block and treating the entire block as
2936 * CDATA.
2937 */
2938 if (ctxt->recovery) {
2939 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2940 xmlStrlen(ctxt->name)) == 0)
2941 {
2942 break; /* while */
2943 } else {
2944 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2945 "Element %s embeds close tag\n",
2946 ctxt->name, NULL);
2947 }
2948 } else {
2949 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2950 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2951 {
2952 break; /* while */
2953 }
2954 }
2955 }
2956 COPY_BUF(l,buf,nbchar,cur);
2957 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2958 if (ctxt->sax->cdataBlock!= NULL) {
2959 /*
2960 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2961 */
2962 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2963 } else if (ctxt->sax->characters != NULL) {
2964 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2965 }
2966 nbchar = 0;
2967 }
2968 GROW;
2969 NEXTL(l);
2970 cur = CUR_CHAR(l);
2971 }
2972
2973 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
2974 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2975 "Invalid char in CDATA 0x%X\n", cur);
2976 if (ctxt->input->cur < ctxt->input->end) {
2977 NEXT;
2978 }
2979 }
2980
2981 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2982 if (ctxt->sax->cdataBlock!= NULL) {
2983 /*
2984 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2985 */
2986 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2987 } else if (ctxt->sax->characters != NULL) {
2988 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2989 }
2990 }
2991}
2992
2993
2994/**
2995 * htmlParseCharDataInternal:
2996 * @ctxt: an HTML parser context
2997 * @readahead: optional read ahead character in ascii range
2998 *
2999 * parse a CharData section.
3000 * if we are within a CDATA section ']]>' marks an end of section.
3001 *
3002 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3003 */
3004
3005static void
3006htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3007 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3008 int nbchar = 0;
3009 int cur, l;
3010 int chunk = 0;
3011
3012 if (readahead)
3013 buf[nbchar++] = readahead;
3014
3015 SHRINK;
3016 cur = CUR_CHAR(l);
3017 while (((cur != '<') || (ctxt->token == '<')) &&
3018 ((cur != '&') || (ctxt->token == '&')) &&
3019 (cur != 0)) {
3020 if (!(IS_CHAR(cur))) {
3021 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3022 "Invalid char in CDATA 0x%X\n", cur);
3023 } else {
3024 COPY_BUF(l,buf,nbchar,cur);
3025 }
3026 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3027 /*
3028 * Ok the segment is to be consumed as chars.
3029 */
3030 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3031 if (areBlanks(ctxt, buf, nbchar)) {
3032 if (ctxt->keepBlanks) {
3033 if (ctxt->sax->characters != NULL)
3034 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3035 } else {
3036 if (ctxt->sax->ignorableWhitespace != NULL)
3037 ctxt->sax->ignorableWhitespace(ctxt->userData,
3038 buf, nbchar);
3039 }
3040 } else {
3041 htmlCheckParagraph(ctxt);
3042 if (ctxt->sax->characters != NULL)
3043 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3044 }
3045 }
3046 nbchar = 0;
3047 }
3048 NEXTL(l);
3049 chunk++;
3050 if (chunk > HTML_PARSER_BUFFER_SIZE) {
3051 chunk = 0;
3052 SHRINK;
3053 GROW;
3054 }
3055 cur = CUR_CHAR(l);
3056 if (cur == 0) {
3057 SHRINK;
3058 GROW;
3059 cur = CUR_CHAR(l);
3060 }
3061 }
3062 if (nbchar != 0) {
3063 buf[nbchar] = 0;
3064
3065 /*
3066 * Ok the segment is to be consumed as chars.
3067 */
3068 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3069 if (areBlanks(ctxt, buf, nbchar)) {
3070 if (ctxt->keepBlanks) {
3071 if (ctxt->sax->characters != NULL)
3072 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3073 } else {
3074 if (ctxt->sax->ignorableWhitespace != NULL)
3075 ctxt->sax->ignorableWhitespace(ctxt->userData,
3076 buf, nbchar);
3077 }
3078 } else {
3079 htmlCheckParagraph(ctxt);
3080 if (ctxt->sax->characters != NULL)
3081 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3082 }
3083 }
3084 } else {
3085 /*
3086 * Loop detection
3087 */
3088 if (cur == 0)
3089 ctxt->instate = XML_PARSER_EOF;
3090 }
3091}
3092
/**
 * htmlParseCharData:
 * @ctxt:  an HTML parser context
 *
 * parse a CharData section.
 * if we are within a CDATA section ']]>' marks an end of section.
 *
 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
 *
 * Thin wrapper: forwards to htmlParseCharDataInternal() with a readahead
 * argument of 0, i.e. no read ahead character is supplied.
 */

static void
htmlParseCharData(htmlParserCtxtPtr ctxt) {
    htmlParseCharDataInternal(ctxt, 0);
}
3107
3108/**
3109 * htmlParseExternalID:
3110 * @ctxt: an HTML parser context
3111 * @publicID: a xmlChar** receiving PubidLiteral
3112 *
3113 * Parse an External ID or a Public ID
3114 *
3115 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3116 * | 'PUBLIC' S PubidLiteral S SystemLiteral
3117 *
3118 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3119 *
3120 * Returns the function returns SystemLiteral and in the second
3121 * case publicID receives PubidLiteral, is strict is off
3122 * it is possible to return NULL and have publicID set.
3123 */
3124
3125static xmlChar *
3126htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3127 xmlChar *URI = NULL;
3128
3129 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3130 (UPP(2) == 'S') && (UPP(3) == 'T') &&
3131 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3132 SKIP(6);
3133 if (!IS_BLANK_CH(CUR)) {
3134 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3135 "Space required after 'SYSTEM'\n", NULL, NULL);
3136 }
3137 SKIP_BLANKS;
3138 URI = htmlParseSystemLiteral(ctxt);
3139 if (URI == NULL) {
3140 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3141 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3142 }
3143 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3144 (UPP(2) == 'B') && (UPP(3) == 'L') &&
3145 (UPP(4) == 'I') && (UPP(5) == 'C')) {
3146 SKIP(6);
3147 if (!IS_BLANK_CH(CUR)) {
3148 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3149 "Space required after 'PUBLIC'\n", NULL, NULL);
3150 }
3151 SKIP_BLANKS;
3152 *publicID = htmlParsePubidLiteral(ctxt);
3153 if (*publicID == NULL) {
3154 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3155 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3156 NULL, NULL);
3157 }
3158 SKIP_BLANKS;
3159 if ((CUR == '"') || (CUR == '\'')) {
3160 URI = htmlParseSystemLiteral(ctxt);
3161 }
3162 }
3163 return(URI);
3164}
3165
3166/**
3167 * xmlParsePI:
3168 * @ctxt: an XML parser context
3169 *
3170 * parse an XML Processing Instruction.
3171 *
3172 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3173 */
3174static void
3175htmlParsePI(htmlParserCtxtPtr ctxt) {
3176 xmlChar *buf = NULL;
3177 int len = 0;
3178 int size = HTML_PARSER_BUFFER_SIZE;
3179 int cur, l;
3180 const xmlChar *target;
3181 xmlParserInputState state;
3182 int count = 0;
3183
3184 if ((RAW == '<') && (NXT(1) == '?')) {
3185 state = ctxt->instate;
3186 ctxt->instate = XML_PARSER_PI;
3187 /*
3188 * this is a Processing Instruction.
3189 */
3190 SKIP(2);
3191 SHRINK;
3192
3193 /*
3194 * Parse the target name and check for special support like
3195 * namespace.
3196 */
3197 target = htmlParseName(ctxt);
3198 if (target != NULL) {
3199 if (RAW == '>') {
3200 SKIP(1);
3201
3202 /*
3203 * SAX: PI detected.
3204 */
3205 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3206 (ctxt->sax->processingInstruction != NULL))
3207 ctxt->sax->processingInstruction(ctxt->userData,
3208 target, NULL);
3209 ctxt->instate = state;
3210 return;
3211 }
3212 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3213 if (buf == NULL) {
3214 htmlErrMemory(ctxt, NULL);
3215 ctxt->instate = state;
3216 return;
3217 }
3218 cur = CUR;
3219 if (!IS_BLANK(cur)) {
3220 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3221 "ParsePI: PI %s space expected\n", target, NULL);
3222 }
3223 SKIP_BLANKS;
3224 cur = CUR_CHAR(l);
3225 while (IS_CHAR(cur) && (cur != '>')) {
3226 if (len + 5 >= size) {
3227 xmlChar *tmp;
3228
3229 size *= 2;
3230 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3231 if (tmp == NULL) {
3232 htmlErrMemory(ctxt, NULL);
3233 xmlFree(buf);
3234 ctxt->instate = state;
3235 return;
3236 }
3237 buf = tmp;
3238 }
3239 count++;
3240 if (count > 50) {
3241 GROW;
3242 count = 0;
3243 }
3244 COPY_BUF(l,buf,len,cur);
3245 NEXTL(l);
3246 cur = CUR_CHAR(l);
3247 if (cur == 0) {
3248 SHRINK;
3249 GROW;
3250 cur = CUR_CHAR(l);
3251 }
3252 }
3253 buf[len] = 0;
3254 if (cur != '>') {
3255 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3256 "ParsePI: PI %s never end ...\n", target, NULL);
3257 } else {
3258 SKIP(1);
3259
3260 /*
3261 * SAX: PI detected.
3262 */
3263 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3264 (ctxt->sax->processingInstruction != NULL))
3265 ctxt->sax->processingInstruction(ctxt->userData,
3266 target, buf);
3267 }
3268 xmlFree(buf);
3269 } else {
3270 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3271 "PI is not started correctly", NULL, NULL);
3272 }
3273 ctxt->instate = state;
3274 }
3275}
3276
3277/**
3278 * htmlParseComment:
3279 * @ctxt: an HTML parser context
3280 *
3281 * Parse an XML (SGML) comment <!-- .... -->
3282 *
3283 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3284 */
3285static void
3286htmlParseComment(htmlParserCtxtPtr ctxt) {
3287 xmlChar *buf = NULL;
3288 int len;
3289 int size = HTML_PARSER_BUFFER_SIZE;
3290 int q, ql;
3291 int r, rl;
3292 int cur, l;
3293 xmlParserInputState state;
3294
3295 /*
3296 * Check that there is a comment right here.
3297 */
3298 if ((RAW != '<') || (NXT(1) != '!') ||
3299 (NXT(2) != '-') || (NXT(3) != '-')) return;
3300
3301 state = ctxt->instate;
3302 ctxt->instate = XML_PARSER_COMMENT;
3303 SHRINK;
3304 SKIP(4);
3305 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3306 if (buf == NULL) {
3307 htmlErrMemory(ctxt, "buffer allocation failed\n");
3308 ctxt->instate = state;
3309 return;
3310 }
3311 len = 0;
3312 buf[len] = 0;
3313 q = CUR_CHAR(ql);
3314 if (!IS_CHAR(q))
3315 goto unfinished;
3316 NEXTL(ql);
3317 r = CUR_CHAR(rl);
3318 if (!IS_CHAR(r))
3319 goto unfinished;
3320 NEXTL(rl);
3321 cur = CUR_CHAR(l);
3322 while (IS_CHAR(cur) &&
3323 ((cur != '>') ||
3324 (r != '-') || (q != '-'))) {
3325 if (len + 5 >= size) {
3326 xmlChar *tmp;
3327
3328 size *= 2;
3329 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3330 if (tmp == NULL) {
3331 xmlFree(buf);
3332 htmlErrMemory(ctxt, "growing buffer failed\n");
3333 ctxt->instate = state;
3334 return;
3335 }
3336 buf = tmp;
3337 }
3338 COPY_BUF(ql,buf,len,q);
3339 q = r;
3340 ql = rl;
3341 r = cur;
3342 rl = l;
3343 NEXTL(l);
3344 cur = CUR_CHAR(l);
3345 if (cur == 0) {
3346 SHRINK;
3347 GROW;
3348 cur = CUR_CHAR(l);
3349 }
3350 }
3351 buf[len] = 0;
3352 if (IS_CHAR(cur)) {
3353 NEXT;
3354 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3355 (!ctxt->disableSAX))
3356 ctxt->sax->comment(ctxt->userData, buf);
3357 xmlFree(buf);
3358 ctxt->instate = state;
3359 return;
3360 }
3361
3362unfinished:
3363 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3364 "Comment not terminated \n<!--%.50s\n", buf, NULL);
3365 xmlFree(buf);
3366}
3367
3368/**
3369 * htmlParseCharRef:
3370 * @ctxt: an HTML parser context
3371 *
3372 * parse Reference declarations
3373 *
3374 * [66] CharRef ::= '&#' [0-9]+ ';' |
3375 * '&#x' [0-9a-fA-F]+ ';'
3376 *
3377 * Returns the value parsed (as an int)
3378 */
3379int
3380htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3381 int val = 0;
3382
3383 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3384 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3385 "htmlParseCharRef: context error\n",
3386 NULL, NULL);
3387 return(0);
3388 }
3389 if ((CUR == '&') && (NXT(1) == '#') &&
3390 ((NXT(2) == 'x') || NXT(2) == 'X')) {
3391 SKIP(3);
3392 while (CUR != ';') {
3393 if ((CUR >= '0') && (CUR <= '9'))
3394 val = val * 16 + (CUR - '0');
3395 else if ((CUR >= 'a') && (CUR <= 'f'))
3396 val = val * 16 + (CUR - 'a') + 10;
3397 else if ((CUR >= 'A') && (CUR <= 'F'))
3398 val = val * 16 + (CUR - 'A') + 10;
3399 else {
3400 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3401 "htmlParseCharRef: missing semicolon\n",
3402 NULL, NULL);
3403 break;
3404 }
3405 NEXT;
3406 }
3407 if (CUR == ';')
3408 NEXT;
3409 } else if ((CUR == '&') && (NXT(1) == '#')) {
3410 SKIP(2);
3411 while (CUR != ';') {
3412 if ((CUR >= '0') && (CUR <= '9'))
3413 val = val * 10 + (CUR - '0');
3414 else {
3415 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3416 "htmlParseCharRef: missing semicolon\n",
3417 NULL, NULL);
3418 break;
3419 }
3420 NEXT;
3421 }
3422 if (CUR == ';')
3423 NEXT;
3424 } else {
3425 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3426 "htmlParseCharRef: invalid value\n", NULL, NULL);
3427 }
3428 /*
3429 * Check the value IS_CHAR ...
3430 */
3431 if (IS_CHAR(val)) {
3432 return(val);
3433 } else {
3434 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3435 "htmlParseCharRef: invalid xmlChar value %d\n",
3436 val);
3437 }
3438 return(0);
3439}
3440
3441
3442/**
3443 * htmlParseDocTypeDecl:
3444 * @ctxt: an HTML parser context
3445 *
3446 * parse a DOCTYPE declaration
3447 *
3448 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3449 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3450 */
3451
3452static void
3453htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3454 const xmlChar *name;
3455 xmlChar *ExternalID = NULL;
3456 xmlChar *URI = NULL;
3457
3458 /*
3459 * We know that '<!DOCTYPE' has been detected.
3460 */
3461 SKIP(9);
3462
3463 SKIP_BLANKS;
3464
3465 /*
3466 * Parse the DOCTYPE name.
3467 */
3468 name = htmlParseName(ctxt);
3469 if (name == NULL) {
3470 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3471 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3472 NULL, NULL);
3473 }
3474 /*
3475 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3476 */
3477
3478 SKIP_BLANKS;
3479
3480 /*
3481 * Check for SystemID and ExternalID
3482 */
3483 URI = htmlParseExternalID(ctxt, &ExternalID);
3484 SKIP_BLANKS;
3485
3486 /*
3487 * We should be at the end of the DOCTYPE declaration.
3488 */
3489 if (CUR != '>') {
3490 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3491 "DOCTYPE improperly terminated\n", NULL, NULL);
3492 /* We shouldn't try to resynchronize ... */
3493 }
3494 NEXT;
3495
3496 /*
3497 * Create or update the document accordingly to the DOCTYPE
3498 */
3499 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3500 (!ctxt->disableSAX))
3501 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3502
3503 /*
3504 * Cleanup, since we don't use all those identifiers
3505 */
3506 if (URI != NULL) xmlFree(URI);
3507 if (ExternalID != NULL) xmlFree(ExternalID);
3508}
3509
3510/**
3511 * htmlParseAttribute:
3512 * @ctxt: an HTML parser context
3513 * @value: a xmlChar ** used to store the value of the attribute
3514 *
3515 * parse an attribute
3516 *
3517 * [41] Attribute ::= Name Eq AttValue
3518 *
3519 * [25] Eq ::= S? '=' S?
3520 *
3521 * With namespace:
3522 *
3523 * [NS 11] Attribute ::= QName Eq AttValue
3524 *
3525 * Also the case QName == xmlns:??? is handled independently as a namespace
3526 * definition.
3527 *
3528 * Returns the attribute name, and the value in *value.
3529 */
3530
3531static const xmlChar *
3532htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3533 const xmlChar *name;
3534 xmlChar *val = NULL;
3535
3536 *value = NULL;
3537 name = htmlParseHTMLName(ctxt);
3538 if (name == NULL) {
3539 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3540 "error parsing attribute name\n", NULL, NULL);
3541 return(NULL);
3542 }
3543
3544 /*
3545 * read the value
3546 */
3547 SKIP_BLANKS;
3548 if (CUR == '=') {
3549 NEXT;
3550 SKIP_BLANKS;
3551 val = htmlParseAttValue(ctxt);
3552 }
3553
3554 *value = val;
3555 return(name);
3556}
3557
3558/**
3559 * htmlCheckEncodingDirect:
3560 * @ctxt: an HTML parser context
3561 * @attvalue: the attribute value
3562 *
3563 * Checks an attribute value to detect
3564 * the encoding
3565 * If a new encoding is detected the parser is switched to decode
3566 * it and pass UTF8
3567 */
3568static void
3569htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3570
3571 if ((ctxt == NULL) || (encoding == NULL) ||
3572 (ctxt->options & HTML_PARSE_IGNORE_ENC))
3573 return;
3574
3575 /* do not change encoding */
3576 if (ctxt->input->encoding != NULL)
3577 return;
3578
3579 if (encoding != NULL) {
3580 xmlCharEncoding enc;
3581 xmlCharEncodingHandlerPtr handler;
3582
3583 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3584
3585 if (ctxt->input->encoding != NULL)
3586 xmlFree((xmlChar *) ctxt->input->encoding);
3587 ctxt->input->encoding = xmlStrdup(encoding);
3588
3589 enc = xmlParseCharEncoding((const char *) encoding);
3590 /*
3591 * registered set of known encodings
3592 */
3593 if (enc != XML_CHAR_ENCODING_ERROR) {
3594 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3595 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3596 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3597 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3598 (ctxt->input->buf != NULL) &&
3599 (ctxt->input->buf->encoder == NULL)) {
3600 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3601 "htmlCheckEncoding: wrong encoding meta\n",
3602 NULL, NULL);
3603 } else {
3604 xmlSwitchEncoding(ctxt, enc);
3605 }
3606 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3607 } else {
3608 /*
3609 * fallback for unknown encodings
3610 */
3611 handler = xmlFindCharEncodingHandler((const char *) encoding);
3612 if (handler != NULL) {
3613 xmlSwitchToEncoding(ctxt, handler);
3614 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3615 } else {
3616 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3617 "htmlCheckEncoding: unknown encoding %s\n",
3618 encoding, NULL);
3619 }
3620 }
3621
3622 if ((ctxt->input->buf != NULL) &&
3623 (ctxt->input->buf->encoder != NULL) &&
3624 (ctxt->input->buf->raw != NULL) &&
3625 (ctxt->input->buf->buffer != NULL)) {
3626 int nbchars;
3627 int processed;
3628
3629 /*
3630 * convert as much as possible to the parser reading buffer.
3631 */
3632 processed = ctxt->input->cur - ctxt->input->base;
3633 xmlBufShrink(ctxt->input->buf->buffer, processed);
3634 nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3635 if (nbchars < 0) {
3636 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3637 "htmlCheckEncoding: encoder error\n",
3638 NULL, NULL);
3639 }
3640 xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3641 }
3642 }
3643}
3644
3645/**
3646 * htmlCheckEncoding:
3647 * @ctxt: an HTML parser context
3648 * @attvalue: the attribute value
3649 *
3650 * Checks an http-equiv attribute from a Meta tag to detect
3651 * the encoding
3652 * If a new encoding is detected the parser is switched to decode
3653 * it and pass UTF8
3654 */
3655static void
3656htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3657 const xmlChar *encoding;
3658
3659 if (!attvalue)
3660 return;
3661
3662 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3663 if (encoding != NULL) {
3664 encoding += 7;
3665 }
3666 /*
3667 * skip blank
3668 */
3669 if (encoding && IS_BLANK_CH(*encoding))
3670 encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3671 if (encoding && *encoding == '=') {
3672 encoding ++;
3673 htmlCheckEncodingDirect(ctxt, encoding);
3674 }
3675}
3676
3677/**
3678 * htmlCheckMeta:
3679 * @ctxt: an HTML parser context
3680 * @atts: the attributes values
3681 *
3682 * Checks an attributes from a Meta tag
3683 */
3684static void
3685htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3686 int i;
3687 const xmlChar *att, *value;
3688 int http = 0;
3689 const xmlChar *content = NULL;
3690
3691 if ((ctxt == NULL) || (atts == NULL))
3692 return;
3693
3694 i = 0;
3695 att = atts[i++];
3696 while (att != NULL) {
3697 value = atts[i++];
3698 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3699 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3700 http = 1;
3701 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3702 htmlCheckEncodingDirect(ctxt, value);
3703 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3704 content = value;
3705 att = atts[i++];
3706 }
3707 if ((http) && (content != NULL))
3708 htmlCheckEncoding(ctxt, content);
3709
3710}
3711
/**
 * htmlParseStartTag:
 * @ctxt:  an HTML parser context
 *
 * parse a start of tag either for rule element or
 * EmptyElement. In both case we don't parse the tag closing chars.
 *
 * [40] STag ::= '<' Name (S Attribute)* S? '>'
 *
 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
 *
 * With namespace:
 *
 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
 *
 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
 *
 * The attributes are collected as name/value pairs in the array kept in
 * ctxt->atts, which is allocated on first use and doubled when full.
 * A misplaced <html>, <head> or <body> tag is parsed but discarded: it
 * is neither pushed on the name stack nor reported to SAX. Attribute
 * values are freed before returning; attribute names are not.
 *
 * Returns 0 in case of success, -1 in case of error and 1 if discarded
 */

static int
htmlParseStartTag(htmlParserCtxtPtr ctxt) {
    const xmlChar *name;
    const xmlChar *attname;
    xmlChar *attvalue;
    const xmlChar **atts;
    int nbatts = 0;
    int maxatts;
    int meta = 0;
    int i;
    int discardtag = 0;

    if ((ctxt == NULL) || (ctxt->input == NULL)) {
        htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                     "htmlParseStartTag: context error\n", NULL, NULL);
        return -1;
    }
    if (ctxt->instate == XML_PARSER_EOF)
        return(-1);
    if (CUR != '<') return -1;
    NEXT;

    /* Work on local copies; ctxt is updated whenever the array changes. */
    atts = ctxt->atts;
    maxatts = ctxt->maxatts;

    GROW;
    name = htmlParseHTMLName(ctxt);
    if (name == NULL) {
        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
                     "htmlParseStartTag: invalid element name\n",
                     NULL, NULL);
        /*
         * if recover preserve text on classic misconstructs: the '<'
         * already consumed is handed back as the read ahead character
         * so that it is emitted as text.
         */
        if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
            (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
            htmlParseCharDataInternal(ctxt, '<');
            return(-1);
        }


        /* Dump the bogus tag like browsers do */
        while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
               (ctxt->instate != XML_PARSER_EOF))
            NEXT;
        return -1;
    }
    /* Remember META tags: their attributes may declare the encoding. */
    if (xmlStrEqual(name, BAD_CAST"meta"))
        meta = 1;

    /*
     * Check for auto-closure of HTML elements.
     */
    htmlAutoClose(ctxt, name);

    /*
     * Check for implied HTML elements.
     */
    htmlCheckImplied(ctxt, name);

    /*
     * Avoid html at any level > 0, head at any level != 1
     * or any attempt to recurse body
     *
     * NOTE(review): ctxt->depth is incremented for every discarded tag;
     * presumably the end tag handling balances this - confirm there.
     */
    if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
        htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
                     "htmlParseStartTag: misplaced <html> tag\n",
                     name, NULL);
        discardtag = 1;
        ctxt->depth++;
    }
    if ((ctxt->nameNr != 1) &&
        (xmlStrEqual(name, BAD_CAST"head"))) {
        htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
                     "htmlParseStartTag: misplaced <head> tag\n",
                     name, NULL);
        discardtag = 1;
        ctxt->depth++;
    }
    if (xmlStrEqual(name, BAD_CAST"body")) {
        int indx;
        /* A body is misplaced if one is already open on the name stack. */
        for (indx = 0;indx < ctxt->nameNr;indx++) {
            if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
                htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
                             "htmlParseStartTag: misplaced <body> tag\n",
                             name, NULL);
                discardtag = 1;
                ctxt->depth++;
            }
        }
    }

    /*
     * Now parse the attributes, it ends up with the ending
     *
     * (S Attribute)* S?
     */
    SKIP_BLANKS;
    while ((IS_CHAR_CH(CUR)) &&
           (CUR != '>') &&
           ((CUR != '/') || (NXT(1) != '>'))) {
        /* Snapshot used at the bottom to detect a lack of progress. */
        long cons = ctxt->nbChars;

        GROW;
        attname = htmlParseAttribute(ctxt, &attvalue);
        if (attname != NULL) {

            /*
             * Well formedness requires at most one declaration of an attribute
             * (names sit at the even indexes); the duplicate is dropped.
             */
            for (i = 0; i < nbatts;i += 2) {
                if (xmlStrEqual(atts[i], attname)) {
                    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
                                 "Attribute %s redefined\n", attname, NULL);
                    if (attvalue != NULL)
                        xmlFree(attvalue);
                    goto failed;
                }
            }

            /*
             * Add the pair to atts
             */
            if (atts == NULL) {
                maxatts = 22; /* allow for 10 attrs by default */
                atts = (const xmlChar **)
                       xmlMalloc(maxatts * sizeof(xmlChar *));
                if (atts == NULL) {
                    htmlErrMemory(ctxt, NULL);
                    if (attvalue != NULL)
                        xmlFree(attvalue);
                    goto failed;
                }
                ctxt->atts = atts;
                ctxt->maxatts = maxatts;
            } else if (nbatts + 4 > maxatts) {
                /* Room is needed for the new pair plus the two NULLs. */
                const xmlChar **n;

                maxatts *= 2;
                n = (const xmlChar **) xmlRealloc((void *) atts,
                                             maxatts * sizeof(const xmlChar *));
                if (n == NULL) {
                    htmlErrMemory(ctxt, NULL);
                    if (attvalue != NULL)
                        xmlFree(attvalue);
                    goto failed;
                }
                atts = n;
                ctxt->atts = atts;
                ctxt->maxatts = maxatts;
            }
            /* Append the pair and keep the array NULL terminated. */
            atts[nbatts++] = attname;
            atts[nbatts++] = attvalue;
            atts[nbatts] = NULL;
            atts[nbatts + 1] = NULL;
        }
        else {
            if (attvalue != NULL)
                xmlFree(attvalue);
            /* Dump the bogus attribute string up to the next blank or
             * the end of the tag. */
            while ((IS_CHAR_CH(CUR)) &&
                   !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
                   ((CUR != '/') || (NXT(1) != '>')))
                NEXT;
        }

failed:
        SKIP_BLANKS;
        /* No input consumed during this round: bail out of the loop. */
        if (cons == ctxt->nbChars) {
            htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                         "htmlParseStartTag: problem parsing attributes\n",
                         NULL, NULL);
            break;
        }
    }

    /*
     * Handle specific association to the META tag
     */
    if (meta && (nbatts != 0))
        htmlCheckMeta(ctxt, atts);

    /*
     * SAX: Start of Element !
     * (a handler with no attributes gets NULL, not an empty array)
     */
    if (!discardtag) {
        htmlnamePush(ctxt, name);
        if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
            if (nbatts != 0)
                ctxt->sax->startElement(ctxt->userData, name, atts);
            else
                ctxt->sax->startElement(ctxt->userData, name, NULL);
        }
    }

    /* Free the attribute values, which sit at the odd indexes. */
    if (atts != NULL) {
        for (i = 1;i < nbatts;i += 2) {
            if (atts[i] != NULL)
                xmlFree((xmlChar *) atts[i]);
        }
    }

    return(discardtag);
}
3935
3936/**
3937 * htmlParseEndTag:
3938 * @ctxt: an HTML parser context
3939 *
3940 * parse an end of tag
3941 *
3942 * [42] ETag ::= '</' Name S? '>'
3943 *
3944 * With namespace
3945 *
3946 * [NS 9] ETag ::= '</' QName S? '>'
3947 *
3948 * Returns 1 if the current level should be closed.
3949 */
3950
3951static int
3952htmlParseEndTag(htmlParserCtxtPtr ctxt)
3953{
3954 const xmlChar *name;
3955 const xmlChar *oldname;
3956 int i, ret;
3957
3958 if ((CUR != '<') || (NXT(1) != '/')) {
3959 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3960 "htmlParseEndTag: '</' not found\n", NULL, NULL);
3961 return (0);
3962 }
3963 SKIP(2);
3964
3965 name = htmlParseHTMLName(ctxt);
3966 if (name == NULL)
3967 return (0);
3968 /*
3969 * We should definitely be at the ending "S? '>'" part
3970 */
3971 SKIP_BLANKS;
3972 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
3973 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3974 "End tag : expected '>'\n", NULL, NULL);
3975 if (ctxt->recovery) {
3976 /*
3977 * We're not at the ending > !!
3978 * Error, unless in recover mode where we search forwards
3979 * until we find a >
3980 */
3981 while (CUR != '\0' && CUR != '>') NEXT;
3982 NEXT;
3983 }
3984 } else
3985 NEXT;
3986
3987 /*
3988 * if we ignored misplaced tags in htmlParseStartTag don't pop them
3989 * out now.
3990 */
3991 if ((ctxt->depth > 0) &&
3992 (xmlStrEqual(name, BAD_CAST "html") ||
3993 xmlStrEqual(name, BAD_CAST "body") ||
3994 xmlStrEqual(name, BAD_CAST "head"))) {
3995 ctxt->depth--;
3996 return (0);
3997 }
3998
3999 /*
4000 * If the name read is not one of the element in the parsing stack
4001 * then return, it's just an error.
4002 */
4003 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4004 if (xmlStrEqual(name, ctxt->nameTab[i]))
4005 break;
4006 }
4007 if (i < 0) {
4008 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4009 "Unexpected end tag : %s\n", name, NULL);
4010 return (0);
4011 }
4012
4013
4014 /*
4015 * Check for auto-closure of HTML elements.
4016 */
4017
4018 htmlAutoCloseOnClose(ctxt, name);
4019
4020 /*
4021 * Well formedness constraints, opening and closing must match.
4022 * With the exception that the autoclose may have popped stuff out
4023 * of the stack.
4024 */
4025 if (!xmlStrEqual(name, ctxt->name)) {
4026 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4027 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4028 "Opening and ending tag mismatch: %s and %s\n",
4029 name, ctxt->name);
4030 }
4031 }
4032
4033 /*
4034 * SAX: End of Tag
4035 */
4036 oldname = ctxt->name;
4037 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4038 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4039 ctxt->sax->endElement(ctxt->userData, name);
4040 htmlNodeInfoPop(ctxt);
4041 htmlnamePop(ctxt);
4042 ret = 1;
4043 } else {
4044 ret = 0;
4045 }
4046
4047 return (ret);
4048}
4049
4050
4051/**
4052 * htmlParseReference:
4053 * @ctxt: an HTML parser context
4054 *
4055 * parse and handle entity references in content,
4056 * this will end-up in a call to character() since this is either a
4057 * CharRef, or a predefined entity.
4058 */
4059static void
4060htmlParseReference(htmlParserCtxtPtr ctxt) {
4061 const htmlEntityDesc * ent;
4062 xmlChar out[6];
4063 const xmlChar *name;
4064 if (CUR != '&') return;
4065
4066 if (NXT(1) == '#') {
4067 unsigned int c;
4068 int bits, i = 0;
4069
4070 c = htmlParseCharRef(ctxt);
4071 if (c == 0)
4072 return;
4073
4074 if (c < 0x80) { out[i++]= c; bits= -6; }
4075 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4076 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4077 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4078
4079 for ( ; bits >= 0; bits-= 6) {
4080 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4081 }
4082 out[i] = 0;
4083
4084 htmlCheckParagraph(ctxt);
4085 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4086 ctxt->sax->characters(ctxt->userData, out, i);
4087 } else {
4088 ent = htmlParseEntityRef(ctxt, &name);
4089 if (name == NULL) {
4090 htmlCheckParagraph(ctxt);
4091 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4092 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4093 return;
4094 }
4095 if ((ent == NULL) || !(ent->value > 0)) {
4096 htmlCheckParagraph(ctxt);
4097 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4098 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4099 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4100 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4101 }
4102 } else {
4103 unsigned int c;
4104 int bits, i = 0;
4105
4106 c = ent->value;
4107 if (c < 0x80)
4108 { out[i++]= c; bits= -6; }
4109 else if (c < 0x800)
4110 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4111 else if (c < 0x10000)
4112 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4113 else
4114 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4115
4116 for ( ; bits >= 0; bits-= 6) {
4117 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4118 }
4119 out[i] = 0;
4120
4121 htmlCheckParagraph(ctxt);
4122 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4123 ctxt->sax->characters(ctxt->userData, out, i);
4124 }
4125 }
4126}
4127
4128/**
4129 * htmlParseContent:
4130 * @ctxt: an HTML parser context
4131 *
4132 * Parse a content: comment, sub-element, reference or text.
4133 * Kept for compatibility with old code
4134 */
4135
4136static void
4137htmlParseContent(htmlParserCtxtPtr ctxt) {
4138 xmlChar *currentNode;
4139 int depth;
4140 const xmlChar *name;
4141
4142 currentNode = xmlStrdup(ctxt->name);
4143 depth = ctxt->nameNr;
4144 while (1) {
4145 long cons = ctxt->nbChars;
4146
4147 GROW;
4148
4149 if (ctxt->instate == XML_PARSER_EOF)
4150 break;
4151
4152 /*
4153 * Our tag or one of it's parent or children is ending.
4154 */
4155 if ((CUR == '<') && (NXT(1) == '/')) {
4156 if (htmlParseEndTag(ctxt) &&
4157 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4158 if (currentNode != NULL)
4159 xmlFree(currentNode);
4160 return;
4161 }
4162 continue; /* while */
4163 }
4164
4165 else if ((CUR == '<') &&
4166 ((IS_ASCII_LETTER(NXT(1))) ||
4167 (NXT(1) == '_') || (NXT(1) == ':'))) {
4168 name = htmlParseHTMLName_nonInvasive(ctxt);
4169 if (name == NULL) {
4170 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4171 "htmlParseStartTag: invalid element name\n",
4172 NULL, NULL);
4173 /* Dump the bogus tag like browsers do */
4174 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4175 NEXT;
4176
4177 if (currentNode != NULL)
4178 xmlFree(currentNode);
4179 return;
4180 }
4181
4182 if (ctxt->name != NULL) {
4183 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4184 htmlAutoClose(ctxt, name);
4185 continue;
4186 }
4187 }
4188 }
4189
4190 /*
4191 * Has this node been popped out during parsing of
4192 * the next element
4193 */
4194 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4195 (!xmlStrEqual(currentNode, ctxt->name)))
4196 {
4197 if (currentNode != NULL) xmlFree(currentNode);
4198 return;
4199 }
4200
4201 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4202 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4203 /*
4204 * Handle SCRIPT/STYLE separately
4205 */
4206 htmlParseScript(ctxt);
4207 } else {
4208 /*
4209 * Sometimes DOCTYPE arrives in the middle of the document
4210 */
4211 if ((CUR == '<') && (NXT(1) == '!') &&
4212 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4213 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4214 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4215 (UPP(8) == 'E')) {
4216 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4217 "Misplaced DOCTYPE declaration\n",
4218 BAD_CAST "DOCTYPE" , NULL);
4219 htmlParseDocTypeDecl(ctxt);
4220 }
4221
4222 /*
4223 * First case : a comment
4224 */
4225 if ((CUR == '<') && (NXT(1) == '!') &&
4226 (NXT(2) == '-') && (NXT(3) == '-')) {
4227 htmlParseComment(ctxt);
4228 }
4229
4230 /*
4231 * Second case : a Processing Instruction.
4232 */
4233 else if ((CUR == '<') && (NXT(1) == '?')) {
4234 htmlParsePI(ctxt);
4235 }
4236
4237 /*
4238 * Third case : a sub-element.
4239 */
4240 else if (CUR == '<') {
4241 htmlParseElement(ctxt);
4242 }
4243
4244 /*
4245 * Fourth case : a reference. If if has not been resolved,
4246 * parsing returns it's Name, create the node
4247 */
4248 else if (CUR == '&') {
4249 htmlParseReference(ctxt);
4250 }
4251
4252 /*
4253 * Fifth case : end of the resource
4254 */
4255 else if (CUR == 0) {
4256 htmlAutoCloseOnEnd(ctxt);
4257 break;
4258 }
4259
4260 /*
4261 * Last case, text. Note that References are handled directly.
4262 */
4263 else {
4264 htmlParseCharData(ctxt);
4265 }
4266
4267 if (cons == ctxt->nbChars) {
4268 if (ctxt->node != NULL) {
4269 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4270 "detected an error in element content\n",
4271 NULL, NULL);
4272 }
4273 break;
4274 }
4275 }
4276 GROW;
4277 }
4278 if (currentNode != NULL) xmlFree(currentNode);
4279}
4280
4281/**
4282 * htmlParseElement:
4283 * @ctxt: an HTML parser context
4284 *
4285 * parse an HTML element, this is highly recursive
4286 * this is kept for compatibility with previous code versions
4287 *
4288 * [39] element ::= EmptyElemTag | STag content ETag
4289 *
4290 * [41] Attribute ::= Name Eq AttValue
4291 */
4292
4293void
4294htmlParseElement(htmlParserCtxtPtr ctxt) {
4295 const xmlChar *name;
4296 xmlChar *currentNode = NULL;
4297 const htmlElemDesc * info;
4298 htmlParserNodeInfo node_info;
4299 int failed;
4300 int depth;
4301 const xmlChar *oldptr;
4302
4303 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4304 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4305 "htmlParseElement: context error\n", NULL, NULL);
4306 return;
4307 }
4308
4309 if (ctxt->instate == XML_PARSER_EOF)
4310 return;
4311
4312 /* Capture start position */
4313 if (ctxt->record_info) {
4314 node_info.begin_pos = ctxt->input->consumed +
4315 (CUR_PTR - ctxt->input->base);
4316 node_info.begin_line = ctxt->input->line;
4317 }
4318
4319 failed = htmlParseStartTag(ctxt);
4320 name = ctxt->name;
4321 if ((failed == -1) || (name == NULL)) {
4322 if (CUR == '>')
4323 NEXT;
4324 return;
4325 }
4326
4327 /*
4328 * Lookup the info for that element.
4329 */
4330 info = htmlTagLookup(name);
4331 if (info == NULL) {
4332 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4333 "Tag %s invalid\n", name, NULL);
4334 }
4335
4336 /*
4337 * Check for an Empty Element labeled the XML/SGML way
4338 */
4339 if ((CUR == '/') && (NXT(1) == '>')) {
4340 SKIP(2);
4341 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4342 ctxt->sax->endElement(ctxt->userData, name);
4343 htmlnamePop(ctxt);
4344 return;
4345 }
4346
4347 if (CUR == '>') {
4348 NEXT;
4349 } else {
4350 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4351 "Couldn't find end of Start Tag %s\n", name, NULL);
4352
4353 /*
4354 * end of parsing of this node.
4355 */
4356 if (xmlStrEqual(name, ctxt->name)) {
4357 nodePop(ctxt);
4358 htmlnamePop(ctxt);
4359 }
4360
4361 /*
4362 * Capture end position and add node
4363 */
4364 if (ctxt->record_info) {
4365 node_info.end_pos = ctxt->input->consumed +
4366 (CUR_PTR - ctxt->input->base);
4367 node_info.end_line = ctxt->input->line;
4368 node_info.node = ctxt->node;
4369 xmlParserAddNodeInfo(ctxt, &node_info);
4370 }
4371 return;
4372 }
4373
4374 /*
4375 * Check for an Empty Element from DTD definition
4376 */
4377 if ((info != NULL) && (info->empty)) {
4378 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4379 ctxt->sax->endElement(ctxt->userData, name);
4380 htmlnamePop(ctxt);
4381 return;
4382 }
4383
4384 /*
4385 * Parse the content of the element:
4386 */
4387 currentNode = xmlStrdup(ctxt->name);
4388 depth = ctxt->nameNr;
4389 while (IS_CHAR_CH(CUR)) {
4390 oldptr = ctxt->input->cur;
4391 htmlParseContent(ctxt);
4392 if (oldptr==ctxt->input->cur) break;
4393 if (ctxt->nameNr < depth) break;
4394 }
4395
4396 /*
4397 * Capture end position and add node
4398 */
4399 if ( currentNode != NULL && ctxt->record_info ) {
4400 node_info.end_pos = ctxt->input->consumed +
4401 (CUR_PTR - ctxt->input->base);
4402 node_info.end_line = ctxt->input->line;
4403 node_info.node = ctxt->node;
4404 xmlParserAddNodeInfo(ctxt, &node_info);
4405 }
4406 if (!IS_CHAR_CH(CUR)) {
4407 htmlAutoCloseOnEnd(ctxt);
4408 }
4409
4410 if (currentNode != NULL)
4411 xmlFree(currentNode);
4412}
4413
4414static void
4415htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4416 /*
4417 * Capture end position and add node
4418 */
4419 if ( ctxt->node != NULL && ctxt->record_info ) {
4420 ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4421 (CUR_PTR - ctxt->input->base);
4422 ctxt->nodeInfo->end_line = ctxt->input->line;
4423 ctxt->nodeInfo->node = ctxt->node;
4424 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4425 htmlNodeInfoPop(ctxt);
4426 }
4427 if (!IS_CHAR_CH(CUR)) {
4428 htmlAutoCloseOnEnd(ctxt);
4429 }
4430}
4431
4432/**
4433 * htmlParseElementInternal:
4434 * @ctxt: an HTML parser context
4435 *
4436 * parse an HTML element, new version, non recursive
4437 *
4438 * [39] element ::= EmptyElemTag | STag content ETag
4439 *
4440 * [41] Attribute ::= Name Eq AttValue
4441 */
4442
4443static void
4444htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4445 const xmlChar *name;
4446 const htmlElemDesc * info;
4447 htmlParserNodeInfo node_info = { 0, };
4448 int failed;
4449
4450 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4451 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4452 "htmlParseElementInternal: context error\n", NULL, NULL);
4453 return;
4454 }
4455
4456 if (ctxt->instate == XML_PARSER_EOF)
4457 return;
4458
4459 /* Capture start position */
4460 if (ctxt->record_info) {
4461 node_info.begin_pos = ctxt->input->consumed +
4462 (CUR_PTR - ctxt->input->base);
4463 node_info.begin_line = ctxt->input->line;
4464 }
4465
4466 failed = htmlParseStartTag(ctxt);
4467 name = ctxt->name;
4468 if ((failed == -1) || (name == NULL)) {
4469 if (CUR == '>')
4470 NEXT;
4471 return;
4472 }
4473
4474 /*
4475 * Lookup the info for that element.
4476 */
4477 info = htmlTagLookup(name);
4478 if (info == NULL) {
4479 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4480 "Tag %s invalid\n", name, NULL);
4481 }
4482
4483 /*
4484 * Check for an Empty Element labeled the XML/SGML way
4485 */
4486 if ((CUR == '/') && (NXT(1) == '>')) {
4487 SKIP(2);
4488 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4489 ctxt->sax->endElement(ctxt->userData, name);
4490 htmlnamePop(ctxt);
4491 return;
4492 }
4493
4494 if (CUR == '>') {
4495 NEXT;
4496 } else {
4497 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4498 "Couldn't find end of Start Tag %s\n", name, NULL);
4499
4500 /*
4501 * end of parsing of this node.
4502 */
4503 if (xmlStrEqual(name, ctxt->name)) {
4504 nodePop(ctxt);
4505 htmlnamePop(ctxt);
4506 }
4507
4508 if (ctxt->record_info)
4509 htmlNodeInfoPush(ctxt, &node_info);
4510 htmlParserFinishElementParsing(ctxt);
4511 return;
4512 }
4513
4514 /*
4515 * Check for an Empty Element from DTD definition
4516 */
4517 if ((info != NULL) && (info->empty)) {
4518 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4519 ctxt->sax->endElement(ctxt->userData, name);
4520 htmlnamePop(ctxt);
4521 return;
4522 }
4523
4524 if (ctxt->record_info)
4525 htmlNodeInfoPush(ctxt, &node_info);
4526}
4527
4528/**
4529 * htmlParseContentInternal:
4530 * @ctxt: an HTML parser context
4531 *
4532 * Parse a content: comment, sub-element, reference or text.
4533 * New version for non recursive htmlParseElementInternal
4534 */
4535
4536static void
4537htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4538 xmlChar *currentNode;
4539 int depth;
4540 const xmlChar *name;
4541
4542 currentNode = xmlStrdup(ctxt->name);
4543 depth = ctxt->nameNr;
4544 while (1) {
4545 long cons = ctxt->nbChars;
4546
4547 GROW;
4548
4549 if (ctxt->instate == XML_PARSER_EOF)
4550 break;
4551
4552 /*
4553 * Our tag or one of it's parent or children is ending.
4554 */
4555 if ((CUR == '<') && (NXT(1) == '/')) {
4556 if (htmlParseEndTag(ctxt) &&
4557 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4558 if (currentNode != NULL)
4559 xmlFree(currentNode);
4560
4561 currentNode = xmlStrdup(ctxt->name);
4562 depth = ctxt->nameNr;
4563 }
4564 continue; /* while */
4565 }
4566
4567 else if ((CUR == '<') &&
4568 ((IS_ASCII_LETTER(NXT(1))) ||
4569 (NXT(1) == '_') || (NXT(1) == ':'))) {
4570 name = htmlParseHTMLName_nonInvasive(ctxt);
4571 if (name == NULL) {
4572 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4573 "htmlParseStartTag: invalid element name\n",
4574 NULL, NULL);
4575 /* Dump the bogus tag like browsers do */
4576 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4577 NEXT;
4578
4579 htmlParserFinishElementParsing(ctxt);
4580 if (currentNode != NULL)
4581 xmlFree(currentNode);
4582
4583 currentNode = xmlStrdup(ctxt->name);
4584 depth = ctxt->nameNr;
4585 continue;
4586 }
4587
4588 if (ctxt->name != NULL) {
4589 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4590 htmlAutoClose(ctxt, name);
4591 continue;
4592 }
4593 }
4594 }
4595
4596 /*
4597 * Has this node been popped out during parsing of
4598 * the next element
4599 */
4600 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4601 (!xmlStrEqual(currentNode, ctxt->name)))
4602 {
4603 htmlParserFinishElementParsing(ctxt);
4604 if (currentNode != NULL) xmlFree(currentNode);
4605
4606 currentNode = xmlStrdup(ctxt->name);
4607 depth = ctxt->nameNr;
4608 continue;
4609 }
4610
4611 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4612 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4613 /*
4614 * Handle SCRIPT/STYLE separately
4615 */
4616 htmlParseScript(ctxt);
4617 } else {
4618 /*
4619 * Sometimes DOCTYPE arrives in the middle of the document
4620 */
4621 if ((CUR == '<') && (NXT(1) == '!') &&
4622 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4623 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4624 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4625 (UPP(8) == 'E')) {
4626 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4627 "Misplaced DOCTYPE declaration\n",
4628 BAD_CAST "DOCTYPE" , NULL);
4629 htmlParseDocTypeDecl(ctxt);
4630 }
4631
4632 /*
4633 * First case : a comment
4634 */
4635 if ((CUR == '<') && (NXT(1) == '!') &&
4636 (NXT(2) == '-') && (NXT(3) == '-')) {
4637 htmlParseComment(ctxt);
4638 }
4639
4640 /*
4641 * Second case : a Processing Instruction.
4642 */
4643 else if ((CUR == '<') && (NXT(1) == '?')) {
4644 htmlParsePI(ctxt);
4645 }
4646
4647 /*
4648 * Third case : a sub-element.
4649 */
4650 else if (CUR == '<') {
4651 htmlParseElementInternal(ctxt);
4652 if (currentNode != NULL) xmlFree(currentNode);
4653
4654 currentNode = xmlStrdup(ctxt->name);
4655 depth = ctxt->nameNr;
4656 }
4657
4658 /*
4659 * Fourth case : a reference. If if has not been resolved,
4660 * parsing returns it's Name, create the node
4661 */
4662 else if (CUR == '&') {
4663 htmlParseReference(ctxt);
4664 }
4665
4666 /*
4667 * Fifth case : end of the resource
4668 */
4669 else if (CUR == 0) {
4670 htmlAutoCloseOnEnd(ctxt);
4671 break;
4672 }
4673
4674 /*
4675 * Last case, text. Note that References are handled directly.
4676 */
4677 else {
4678 htmlParseCharData(ctxt);
4679 }
4680
4681 if (cons == ctxt->nbChars) {
4682 if (ctxt->node != NULL) {
4683 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4684 "detected an error in element content\n",
4685 NULL, NULL);
4686 }
4687 break;
4688 }
4689 }
4690 GROW;
4691 }
4692 if (currentNode != NULL) xmlFree(currentNode);
4693}
4694
4695/**
4696 * htmlParseContent:
4697 * @ctxt: an HTML parser context
4698 *
4699 * Parse a content: comment, sub-element, reference or text.
4700 * This is the entry point when called from parser.c
4701 */
4702
4703void
4704__htmlParseContent(void *ctxt) {
4705 if (ctxt != NULL)
4706 htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4707}
4708
4709/**
4710 * htmlParseDocument:
4711 * @ctxt: an HTML parser context
4712 *
4713 * parse an HTML document (and build a tree if using the standard SAX
4714 * interface).
4715 *
4716 * Returns 0, -1 in case of error. the parser context is augmented
4717 * as a result of the parsing.
4718 */
4719
4720int
4721htmlParseDocument(htmlParserCtxtPtr ctxt) {
4722 xmlChar start[4];
4723 xmlCharEncoding enc;
4724 xmlDtdPtr dtd;
4725
4726 xmlInitParser();
4727
4728 htmlDefaultSAXHandlerInit();
4729
4730 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4731 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4732 "htmlParseDocument: context error\n", NULL, NULL);
4733 return(XML_ERR_INTERNAL_ERROR);
4734 }
4735 ctxt->html = 1;
4736 ctxt->linenumbers = 1;
4737 GROW;
4738 /*
4739 * SAX: beginning of the document processing.
4740 */
4741 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4742 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4743
4744 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4745 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4746 /*
4747 * Get the 4 first bytes and decode the charset
4748 * if enc != XML_CHAR_ENCODING_NONE
4749 * plug some encoding conversion routines.
4750 */
4751 start[0] = RAW;
4752 start[1] = NXT(1);
4753 start[2] = NXT(2);
4754 start[3] = NXT(3);
4755 enc = xmlDetectCharEncoding(&start[0], 4);
4756 if (enc != XML_CHAR_ENCODING_NONE) {
4757 xmlSwitchEncoding(ctxt, enc);
4758 }
4759 }
4760
4761 /*
4762 * Wipe out everything which is before the first '<'
4763 */
4764 SKIP_BLANKS;
4765 if (CUR == 0) {
4766 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4767 "Document is empty\n", NULL, NULL);
4768 }
4769
4770 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4771 ctxt->sax->startDocument(ctxt->userData);
4772
4773
4774 /*
4775 * Parse possible comments and PIs before any content
4776 */
4777 while (((CUR == '<') && (NXT(1) == '!') &&
4778 (NXT(2) == '-') && (NXT(3) == '-')) ||
4779 ((CUR == '<') && (NXT(1) == '?'))) {
4780 htmlParseComment(ctxt);
4781 htmlParsePI(ctxt);
4782 SKIP_BLANKS;
4783 }
4784
4785
4786 /*
4787 * Then possibly doc type declaration(s) and more Misc
4788 * (doctypedecl Misc*)?
4789 */
4790 if ((CUR == '<') && (NXT(1) == '!') &&
4791 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4792 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4793 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4794 (UPP(8) == 'E')) {
4795 htmlParseDocTypeDecl(ctxt);
4796 }
4797 SKIP_BLANKS;
4798
4799 /*
4800 * Parse possible comments and PIs before any content
4801 */
4802 while (((CUR == '<') && (NXT(1) == '!') &&
4803 (NXT(2) == '-') && (NXT(3) == '-')) ||
4804 ((CUR == '<') && (NXT(1) == '?'))) {
4805 htmlParseComment(ctxt);
4806 htmlParsePI(ctxt);
4807 SKIP_BLANKS;
4808 }
4809
4810 /*
4811 * Time to start parsing the tree itself
4812 */
4813 htmlParseContentInternal(ctxt);
4814
4815 /*
4816 * autoclose
4817 */
4818 if (CUR == 0)
4819 htmlAutoCloseOnEnd(ctxt);
4820
4821
4822 /*
4823 * SAX: end of the document processing.
4824 */
4825 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4826 ctxt->sax->endDocument(ctxt->userData);
4827
4828 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4829 dtd = xmlGetIntSubset(ctxt->myDoc);
4830 if (dtd == NULL)
4831 ctxt->myDoc->intSubset =
4832 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4833 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4834 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4835 }
4836 if (! ctxt->wellFormed) return(-1);
4837 return(0);
4838}
4839
4840
4841/************************************************************************
4842 * *
4843 * Parser contexts handling *
4844 * *
4845 ************************************************************************/
4846
4847/**
4848 * htmlInitParserCtxt:
4849 * @ctxt: an HTML parser context
4850 *
4851 * Initialize a parser context
4852 *
4853 * Returns 0 in case of success and -1 in case of error
4854 */
4855
4856static int
4857htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4858{
4859 htmlSAXHandler *sax;
4860
4861 if (ctxt == NULL) return(-1);
4862 memset(ctxt, 0, sizeof(htmlParserCtxt));
4863
4864 ctxt->dict = xmlDictCreate();
4865 if (ctxt->dict == NULL) {
4866 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4867 return(-1);
4868 }
4869 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4870 if (sax == NULL) {
4871 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4872 return(-1);
4873 }
4874 else
4875 memset(sax, 0, sizeof(htmlSAXHandler));
4876
4877 /* Allocate the Input stack */
4878 ctxt->inputTab = (htmlParserInputPtr *)
4879 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4880 if (ctxt->inputTab == NULL) {
4881 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4882 ctxt->inputNr = 0;
4883 ctxt->inputMax = 0;
4884 ctxt->input = NULL;
4885 return(-1);
4886 }
4887 ctxt->inputNr = 0;
4888 ctxt->inputMax = 5;
4889 ctxt->input = NULL;
4890 ctxt->version = NULL;
4891 ctxt->encoding = NULL;
4892 ctxt->standalone = -1;
4893 ctxt->instate = XML_PARSER_START;
4894
4895 /* Allocate the Node stack */
4896 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4897 if (ctxt->nodeTab == NULL) {
4898 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4899 ctxt->nodeNr = 0;
4900 ctxt->nodeMax = 0;
4901 ctxt->node = NULL;
4902 ctxt->inputNr = 0;
4903 ctxt->inputMax = 0;
4904 ctxt->input = NULL;
4905 return(-1);
4906 }
4907 ctxt->nodeNr = 0;
4908 ctxt->nodeMax = 10;
4909 ctxt->node = NULL;
4910
4911 /* Allocate the Name stack */
4912 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4913 if (ctxt->nameTab == NULL) {
4914 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4915 ctxt->nameNr = 0;
4916 ctxt->nameMax = 0;
4917 ctxt->name = NULL;
4918 ctxt->nodeNr = 0;
4919 ctxt->nodeMax = 0;
4920 ctxt->node = NULL;
4921 ctxt->inputNr = 0;
4922 ctxt->inputMax = 0;
4923 ctxt->input = NULL;
4924 return(-1);
4925 }
4926 ctxt->nameNr = 0;
4927 ctxt->nameMax = 10;
4928 ctxt->name = NULL;
4929
4930 ctxt->nodeInfoTab = NULL;
4931 ctxt->nodeInfoNr = 0;
4932 ctxt->nodeInfoMax = 0;
4933
4934 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4935 else {
4936 ctxt->sax = sax;
4937 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4938 }
4939 ctxt->userData = ctxt;
4940 ctxt->myDoc = NULL;
4941 ctxt->wellFormed = 1;
4942 ctxt->replaceEntities = 0;
4943 ctxt->linenumbers = xmlLineNumbersDefaultValue;
4944 ctxt->html = 1;
4945 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
4946 ctxt->vctxt.userData = ctxt;
4947 ctxt->vctxt.error = xmlParserValidityError;
4948 ctxt->vctxt.warning = xmlParserValidityWarning;
4949 ctxt->record_info = 0;
4950 ctxt->validate = 0;
4951 ctxt->nbChars = 0;
4952 ctxt->checkIndex = 0;
4953 ctxt->catalogs = NULL;
4954 xmlInitNodeInfoSeq(&ctxt->node_seq);
4955 return(0);
4956}
4957
4958/**
4959 * htmlFreeParserCtxt:
4960 * @ctxt: an HTML parser context
4961 *
4962 * Free all the memory used by a parser context. However the parsed
4963 * document in ctxt->myDoc is not freed.
4964 */
4965
4966void
4967htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4968{
4969 xmlFreeParserCtxt(ctxt);
4970}
4971
4972/**
4973 * htmlNewParserCtxt:
4974 *
4975 * Allocate and initialize a new parser context.
4976 *
4977 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4978 */
4979
4980htmlParserCtxtPtr
4981htmlNewParserCtxt(void)
4982{
4983 xmlParserCtxtPtr ctxt;
4984
4985 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4986 if (ctxt == NULL) {
4987 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
4988 return(NULL);
4989 }
4990 memset(ctxt, 0, sizeof(xmlParserCtxt));
4991 if (htmlInitParserCtxt(ctxt) < 0) {
4992 htmlFreeParserCtxt(ctxt);
4993 return(NULL);
4994 }
4995 return(ctxt);
4996}
4997
4998/**
4999 * htmlCreateMemoryParserCtxt:
5000 * @buffer: a pointer to a char array
5001 * @size: the size of the array
5002 *
5003 * Create a parser context for an HTML in-memory document.
5004 *
5005 * Returns the new parser context or NULL
5006 */
5007htmlParserCtxtPtr
5008htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5009 xmlParserCtxtPtr ctxt;
5010 xmlParserInputPtr input;
5011 xmlParserInputBufferPtr buf;
5012
5013 if (buffer == NULL)
5014 return(NULL);
5015 if (size <= 0)
5016 return(NULL);
5017
5018 ctxt = htmlNewParserCtxt();
5019 if (ctxt == NULL)
5020 return(NULL);
5021
5022 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5023 if (buf == NULL) return(NULL);
5024
5025 input = xmlNewInputStream(ctxt);
5026 if (input == NULL) {
5027 xmlFreeParserCtxt(ctxt);
5028 return(NULL);
5029 }
5030
5031 input->filename = NULL;
5032 input->buf = buf;
5033 xmlBufResetInput(buf->buffer, input);
5034
5035 inputPush(ctxt, input);
5036 return(ctxt);
5037}
5038
5039/**
5040 * htmlCreateDocParserCtxt:
5041 * @cur: a pointer to an array of xmlChar
5042 * @encoding: a free form C string describing the HTML document encoding, or NULL
5043 *
5044 * Create a parser context for an HTML document.
5045 *
5046 * TODO: check the need to add encoding handling there
5047 *
5048 * Returns the new parser context or NULL
5049 */
5050static htmlParserCtxtPtr
5051htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
5052 int len;
5053 htmlParserCtxtPtr ctxt;
5054
5055 if (cur == NULL)
5056 return(NULL);
5057 len = xmlStrlen(cur);
5058 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
5059 if (ctxt == NULL)
5060 return(NULL);
5061
5062 if (encoding != NULL) {
5063 xmlCharEncoding enc;
5064 xmlCharEncodingHandlerPtr handler;
5065
5066 if (ctxt->input->encoding != NULL)
5067 xmlFree((xmlChar *) ctxt->input->encoding);
5068 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
5069
5070 enc = xmlParseCharEncoding(encoding);
5071 /*
5072 * registered set of known encodings
5073 */
5074 if (enc != XML_CHAR_ENCODING_ERROR) {
5075 xmlSwitchEncoding(ctxt, enc);
5076 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5077 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5078 "Unsupported encoding %s\n",
5079 (const xmlChar *) encoding, NULL);
5080 }
5081 } else {
5082 /*
5083 * fallback for unknown encodings
5084 */
5085 handler = xmlFindCharEncodingHandler((const char *) encoding);
5086 if (handler != NULL) {
5087 xmlSwitchToEncoding(ctxt, handler);
5088 } else {
5089 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5090 "Unsupported encoding %s\n",
5091 (const xmlChar *) encoding, NULL);
5092 }
5093 }
5094 }
5095 return(ctxt);
5096}
5097
5098#ifdef LIBXML_PUSH_ENABLED
5099/************************************************************************
5100 * *
5101 * Progressive parsing interfaces *
5102 * *
5103 ************************************************************************/
5104
5105/**
5106 * htmlParseLookupSequence:
5107 * @ctxt: an HTML parser context
5108 * @first: the first char to lookup
5109 * @next: the next char to lookup or zero
5110 * @third: the next char to lookup or zero
5111 * @comment: flag to force checking inside comments
5112 *
5113 * Try to find if a sequence (first, next, third) or just (first next) or
5114 * (first) is available in the input stream.
5115 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5116 * to avoid rescanning sequences of bytes, it DOES change the state of the
5117 * parser, do not use liberally.
5118 * This is basically similar to xmlParseLookupSequence()
5119 *
5120 * Returns the index to the current parsing point if the full sequence
5121 * is available, -1 otherwise.
5122 */
5123static int
5124htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5125 xmlChar next, xmlChar third, int iscomment,
5126 int ignoreattrval)
5127{
5128 int base, len;
5129 htmlParserInputPtr in;
5130 const xmlChar *buf;
5131 int incomment = 0;
5132 int invalue = 0;
5133 char valdellim = 0x0;
5134
5135 in = ctxt->input;
5136 if (in == NULL)
5137 return (-1);
5138
5139 base = in->cur - in->base;
5140 if (base < 0)
5141 return (-1);
5142
5143 if (ctxt->checkIndex > base)
5144 base = ctxt->checkIndex;
5145
5146 if (in->buf == NULL) {
5147 buf = in->base;
5148 len = in->length;
5149 } else {
5150 buf = xmlBufContent(in->buf->buffer);
5151 len = xmlBufUse(in->buf->buffer);
5152 }
5153
5154 /* take into account the sequence length */
5155 if (third)
5156 len -= 2;
5157 else if (next)
5158 len--;
5159 for (; base < len; base++) {
5160 if ((!incomment) && (base + 4 < len) && (!iscomment)) {
5161 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5162 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5163 incomment = 1;
5164 /* do not increment past <! - some people use <!--> */
5165 base += 2;
5166 }
5167 }
5168 if (ignoreattrval) {
5169 if (buf[base] == '"' || buf[base] == '\'') {
5170 if (invalue) {
5171 if (buf[base] == valdellim) {
5172 invalue = 0;
5173 continue;
5174 }
5175 } else {
5176 valdellim = buf[base];
5177 invalue = 1;
5178 continue;
5179 }
5180 } else if (invalue) {
5181 continue;
5182 }
5183 }
5184 if (incomment) {
5185 if (base + 3 > len)
5186 return (-1);
5187 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5188 (buf[base + 2] == '>')) {
5189 incomment = 0;
5190 base += 2;
5191 }
5192 continue;
5193 }
5194 if (buf[base] == first) {
5195 if (third != 0) {
5196 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5197 continue;
5198 } else if (next != 0) {
5199 if (buf[base + 1] != next)
5200 continue;
5201 }
5202 ctxt->checkIndex = 0;
5203#ifdef DEBUG_PUSH
5204 if (next == 0)
5205 xmlGenericError(xmlGenericErrorContext,
5206 "HPP: lookup '%c' found at %d\n",
5207 first, base);
5208 else if (third == 0)
5209 xmlGenericError(xmlGenericErrorContext,
5210 "HPP: lookup '%c%c' found at %d\n",
5211 first, next, base);
5212 else
5213 xmlGenericError(xmlGenericErrorContext,
5214 "HPP: lookup '%c%c%c' found at %d\n",
5215 first, next, third, base);
5216#endif
5217 return (base - (in->cur - in->base));
5218 }
5219 }
5220 if ((!incomment) && (!invalue))
5221 ctxt->checkIndex = base;
5222#ifdef DEBUG_PUSH
5223 if (next == 0)
5224 xmlGenericError(xmlGenericErrorContext,
5225 "HPP: lookup '%c' failed\n", first);
5226 else if (third == 0)
5227 xmlGenericError(xmlGenericErrorContext,
5228 "HPP: lookup '%c%c' failed\n", first, next);
5229 else
5230 xmlGenericError(xmlGenericErrorContext,
5231 "HPP: lookup '%c%c%c' failed\n", first, next,
5232 third);
5233#endif
5234 return (-1);
5235}
5236
5237/**
5238 * htmlParseLookupChars:
5239 * @ctxt: an HTML parser context
5240 * @stop: Array of chars, which stop the lookup.
5241 * @stopLen: Length of stop-Array
5242 *
5243 * Try to find if any char of the stop-Array is available in the input
5244 * stream.
5245 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5246 * to avoid rescanning sequences of bytes, it DOES change the state of the
5247 * parser, do not use liberally.
5248 *
5249 * Returns the index to the current parsing point if a stopChar
5250 * is available, -1 otherwise.
5251 */
5252static int
5253htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
5254 int stopLen)
5255{
5256 int base, len;
5257 htmlParserInputPtr in;
5258 const xmlChar *buf;
5259 int incomment = 0;
5260 int i;
5261
5262 in = ctxt->input;
5263 if (in == NULL)
5264 return (-1);
5265
5266 base = in->cur - in->base;
5267 if (base < 0)
5268 return (-1);
5269
5270 if (ctxt->checkIndex > base)
5271 base = ctxt->checkIndex;
5272
5273 if (in->buf == NULL) {
5274 buf = in->base;
5275 len = in->length;
5276 } else {
5277 buf = xmlBufContent(in->buf->buffer);
5278 len = xmlBufUse(in->buf->buffer);
5279 }
5280
5281 for (; base < len; base++) {
5282 if (!incomment && (base + 4 < len)) {
5283 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5284 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5285 incomment = 1;
5286 /* do not increment past <! - some people use <!--> */
5287 base += 2;
5288 }
5289 }
5290 if (incomment) {
5291 if (base + 3 > len)
5292 return (-1);
5293 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5294 (buf[base + 2] == '>')) {
5295 incomment = 0;
5296 base += 2;
5297 }
5298 continue;
5299 }
5300 for (i = 0; i < stopLen; ++i) {
5301 if (buf[base] == stop[i]) {
5302 ctxt->checkIndex = 0;
5303 return (base - (in->cur - in->base));
5304 }
5305 }
5306 }
5307 ctxt->checkIndex = base;
5308 return (-1);
5309}
5310
5311/**
5312 * htmlParseTryOrFinish:
5313 * @ctxt: an HTML parser context
5314 * @terminate: last chunk indicator
5315 *
5316 * Try to progress on parsing
 *
 * State machine of the push parser. Each loop iteration looks at
 * ctxt->instate and at the bytes buffered after the read cursor, then
 * either parses one construct (doctype, comment, PI, start tag, end tag,
 * reference, character data, script/style content), switches to another
 * state, or jumps to "done" when the construct is not yet complete in the
 * buffer. When @terminate is set the completeness lookups are skipped and
 * whatever is buffered is parsed.
5317 *
5318 * Returns zero if no parsing was possible
 * NOTE(review): ret is initialised to 0 and never assigned afterwards, so
 * the value returned is 0 on every path.
5319 */
5320static int
5321htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5322 int ret = 0;
5323 htmlParserInputPtr in;
5324 int avail = 0;
5325 xmlChar cur, next;
5326
    /* Only filled in and used when ctxt->record_info is set (START_TAG). */
5327 htmlParserNodeInfo node_info;
5328
5329#ifdef DEBUG_PUSH
5330 switch (ctxt->instate) {
5331 case XML_PARSER_EOF:
5332 xmlGenericError(xmlGenericErrorContext,
5333 "HPP: try EOF\n"); break;
5334 case XML_PARSER_START:
5335 xmlGenericError(xmlGenericErrorContext,
5336 "HPP: try START\n"); break;
5337 case XML_PARSER_MISC:
5338 xmlGenericError(xmlGenericErrorContext,
5339 "HPP: try MISC\n");break;
5340 case XML_PARSER_COMMENT:
5341 xmlGenericError(xmlGenericErrorContext,
5342 "HPP: try COMMENT\n");break;
5343 case XML_PARSER_PROLOG:
5344 xmlGenericError(xmlGenericErrorContext,
5345 "HPP: try PROLOG\n");break;
5346 case XML_PARSER_START_TAG:
5347 xmlGenericError(xmlGenericErrorContext,
5348 "HPP: try START_TAG\n");break;
5349 case XML_PARSER_CONTENT:
5350 xmlGenericError(xmlGenericErrorContext,
5351 "HPP: try CONTENT\n");break;
5352 case XML_PARSER_CDATA_SECTION:
5353 xmlGenericError(xmlGenericErrorContext,
5354 "HPP: try CDATA_SECTION\n");break;
5355 case XML_PARSER_END_TAG:
5356 xmlGenericError(xmlGenericErrorContext,
5357 "HPP: try END_TAG\n");break;
5358 case XML_PARSER_ENTITY_DECL:
5359 xmlGenericError(xmlGenericErrorContext,
5360 "HPP: try ENTITY_DECL\n");break;
5361 case XML_PARSER_ENTITY_VALUE:
5362 xmlGenericError(xmlGenericErrorContext,
5363 "HPP: try ENTITY_VALUE\n");break;
5364 case XML_PARSER_ATTRIBUTE_VALUE:
5365 xmlGenericError(xmlGenericErrorContext,
5366 "HPP: try ATTRIBUTE_VALUE\n");break;
5367 case XML_PARSER_DTD:
5368 xmlGenericError(xmlGenericErrorContext,
5369 "HPP: try DTD\n");break;
5370 case XML_PARSER_EPILOG:
5371 xmlGenericError(xmlGenericErrorContext,
5372 "HPP: try EPILOG\n");break;
5373 case XML_PARSER_PI:
5374 xmlGenericError(xmlGenericErrorContext,
5375 "HPP: try PI\n");break;
5376 case XML_PARSER_SYSTEM_LITERAL:
5377 xmlGenericError(xmlGenericErrorContext,
5378 "HPP: try SYSTEM_LITERAL\n");break;
5379 }
5380#endif
5381
    /* One iteration per parsed construct or state transition. */
5382 while (1) {
5383
5384 in = ctxt->input;
5385 if (in == NULL) break;
        /* Number of bytes buffered after the read cursor. */
5386 if (in->buf == NULL)
5387 avail = in->length - (in->cur - in->base);
5388 else
5389 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
        /*
         * Last chunk and nothing left to read: run htmlAutoCloseOnEnd() and,
         * if no element name remains on the stack, report endDocument once.
         */
5390 if ((avail == 0) && (terminate)) {
5391 htmlAutoCloseOnEnd(ctxt);
5392 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5393 /*
5394 * SAX: end of the document processing.
5395 */
5396 ctxt->instate = XML_PARSER_EOF;
5397 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5398 ctxt->sax->endDocument(ctxt->userData);
5399 }
5400 }
5401 if (avail < 1)
5402 goto done;
5403 cur = in->cur[0];
        /* A NUL byte is skipped here, whatever the current state is. */
5404 if (cur == 0) {
5405 SKIP(1);
5406 continue;
5407 }
5408
5409 switch (ctxt->instate) {
5410 case XML_PARSER_EOF:
5411 /*
5412 * Document parsing is done !
5413 */
5414 goto done;
5415 case XML_PARSER_START:
5416 /*
5417 * Very first chars read from the document flow.
5418 */
5419 cur = in->cur[0];
5420 if (IS_BLANK_CH(cur)) {
5421 SKIP_BLANKS;
5422 if (in->buf == NULL)
5423 avail = in->length - (in->cur - in->base);
5424 else
5425 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5426 }
5427 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5428 ctxt->sax->setDocumentLocator(ctxt->userData,
5429 &xmlDefaultSAXLocator);
5430 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5431 (!ctxt->disableSAX))
5432 ctxt->sax->startDocument(ctxt->userData);
5433
                /*
                 * NOTE(review): in->cur[1] and UPP(2)..UPP(8) are read
                 * without comparing against avail; presumably this relies
                 * on the buffer being zero terminated - confirm.
                 */
5434 cur = in->cur[0];
5435 next = in->cur[1];
5436 if ((cur == '<') && (next == '!') &&
5437 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5438 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5439 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5440 (UPP(8) == 'E')) {
                    /* Wait for the closing '>' unless this is the last chunk. */
5441 if ((!terminate) &&
5442 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5443 goto done;
5444#ifdef DEBUG_PUSH
5445 xmlGenericError(xmlGenericErrorContext,
5446 "HPP: Parsing internal subset\n");
5447#endif
5448 htmlParseDocTypeDecl(ctxt);
5449 ctxt->instate = XML_PARSER_PROLOG;
5450#ifdef DEBUG_PUSH
5451 xmlGenericError(xmlGenericErrorContext,
5452 "HPP: entering PROLOG\n");
5453#endif
5454 } else {
5455 ctxt->instate = XML_PARSER_MISC;
5456#ifdef DEBUG_PUSH
5457 xmlGenericError(xmlGenericErrorContext,
5458 "HPP: entering MISC\n");
5459#endif
5460 }
5461 break;
            /* MISC: before the doctype; accepts comments, PIs, a doctype. */
5462 case XML_PARSER_MISC:
5463 SKIP_BLANKS;
5464 if (in->buf == NULL)
5465 avail = in->length - (in->cur - in->base);
5466 else
5467 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5468 /*
5469 * no chars in buffer
5470 */
5471 if (avail < 1)
5472 goto done;
5473 /*
5474 * not enough chars in buffer
5475 */
5476 if (avail < 2) {
5477 if (!terminate)
5478 goto done;
5479 else
5480 next = ' ';
5481 } else {
5482 next = in->cur[1];
5483 }
5484 cur = in->cur[0];
5485 if ((cur == '<') && (next == '!') &&
5486 (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    /*
                     * Wait for the end of the comment unless this is the
                     * last chunk.
                     * NOTE(review): the last argument (ignoreattrval) is 1,
                     * so htmlParseLookupSequence() also skips quoted spans
                     * while scanning the comment text; confirm this is
                     * intended for comments containing a lone quote.
                     */
5487 if ((!terminate) &&
5488 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5489 goto done;
5490#ifdef DEBUG_PUSH
5491 xmlGenericError(xmlGenericErrorContext,
5492 "HPP: Parsing Comment\n");
5493#endif
5494 htmlParseComment(ctxt);
5495 ctxt->instate = XML_PARSER_MISC;
5496 } else if ((cur == '<') && (next == '?')) {
5497 if ((!terminate) &&
5498 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5499 goto done;
5500#ifdef DEBUG_PUSH
5501 xmlGenericError(xmlGenericErrorContext,
5502 "HPP: Parsing PI\n");
5503#endif
5504 htmlParsePI(ctxt);
5505 ctxt->instate = XML_PARSER_MISC;
5506 } else if ((cur == '<') && (next == '!') &&
5507 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5508 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5509 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5510 (UPP(8) == 'E')) {
5511 if ((!terminate) &&
5512 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5513 goto done;
5514#ifdef DEBUG_PUSH
5515 xmlGenericError(xmlGenericErrorContext,
5516 "HPP: Parsing internal subset\n");
5517#endif
5518 htmlParseDocTypeDecl(ctxt);
5519 ctxt->instate = XML_PARSER_PROLOG;
5520#ifdef DEBUG_PUSH
5521 xmlGenericError(xmlGenericErrorContext,
5522 "HPP: entering PROLOG\n");
5523#endif
5524 } else if ((cur == '<') && (next == '!') &&
5525 (avail < 9)) {
                    /* "<!" seen but too few bytes to tell what follows. */
5526 goto done;
5527 } else {
5528 ctxt->instate = XML_PARSER_START_TAG;
5529#ifdef DEBUG_PUSH
5530 xmlGenericError(xmlGenericErrorContext,
5531 "HPP: entering START_TAG\n");
5532#endif
5533 }
5534 break;
            /* PROLOG: after the doctype; accepts comments and PIs. */
5535 case XML_PARSER_PROLOG:
5536 SKIP_BLANKS;
5537 if (in->buf == NULL)
5538 avail = in->length - (in->cur - in->base);
5539 else
5540 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5541 if (avail < 2)
5542 goto done;
5543 cur = in->cur[0];
5544 next = in->cur[1];
5545 if ((cur == '<') && (next == '!') &&
5546 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5547 if ((!terminate) &&
5548 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5549 goto done;
5550#ifdef DEBUG_PUSH
5551 xmlGenericError(xmlGenericErrorContext,
5552 "HPP: Parsing Comment\n");
5553#endif
5554 htmlParseComment(ctxt);
5555 ctxt->instate = XML_PARSER_PROLOG;
5556 } else if ((cur == '<') && (next == '?')) {
5557 if ((!terminate) &&
5558 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5559 goto done;
5560#ifdef DEBUG_PUSH
5561 xmlGenericError(xmlGenericErrorContext,
5562 "HPP: Parsing PI\n");
5563#endif
5564 htmlParsePI(ctxt);
5565 ctxt->instate = XML_PARSER_PROLOG;
5566 } else if ((cur == '<') && (next == '!') &&
5567 (avail < 4)) {
5568 goto done;
5569 } else {
5570 ctxt->instate = XML_PARSER_START_TAG;
5571#ifdef DEBUG_PUSH
5572 xmlGenericError(xmlGenericErrorContext,
5573 "HPP: entering START_TAG\n");
5574#endif
5575 }
5576 break;
            /*
             * EPILOG: entered from END_TAG once the name stack is empty.
             * Blanks, comments and PIs are accepted; anything else ends the
             * document with XML_ERR_DOCUMENT_END.
             */
5577 case XML_PARSER_EPILOG:
5578 if (in->buf == NULL)
5579 avail = in->length - (in->cur - in->base);
5580 else
5581 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5582 if (avail < 1)
5583 goto done;
5584 cur = in->cur[0];
5585 if (IS_BLANK_CH(cur)) {
5586 htmlParseCharData(ctxt);
5587 goto done;
5588 }
5589 if (avail < 2)
5590 goto done;
5591 next = in->cur[1];
5592 if ((cur == '<') && (next == '!') &&
5593 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5594 if ((!terminate) &&
5595 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5596 goto done;
5597#ifdef DEBUG_PUSH
5598 xmlGenericError(xmlGenericErrorContext,
5599 "HPP: Parsing Comment\n");
5600#endif
5601 htmlParseComment(ctxt);
5602 ctxt->instate = XML_PARSER_EPILOG;
5603 } else if ((cur == '<') && (next == '?')) {
5604 if ((!terminate) &&
5605 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5606 goto done;
5607#ifdef DEBUG_PUSH
5608 xmlGenericError(xmlGenericErrorContext,
5609 "HPP: Parsing PI\n");
5610#endif
5611 htmlParsePI(ctxt);
5612 ctxt->instate = XML_PARSER_EPILOG;
5613 } else if ((cur == '<') && (next == '!') &&
5614 (avail < 4)) {
5615 goto done;
5616 } else {
5617 ctxt->errNo = XML_ERR_DOCUMENT_END;
5618 ctxt->wellFormed = 0;
5619 ctxt->instate = XML_PARSER_EOF;
5620#ifdef DEBUG_PUSH
5621 xmlGenericError(xmlGenericErrorContext,
5622 "HPP: entering EOF\n");
5623#endif
5624 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5625 ctxt->sax->endDocument(ctxt->userData);
5626 goto done;
5627 }
5628 break;
5629 case XML_PARSER_START_TAG: {
5630 const xmlChar *name;
5631 int failed;
5632 const htmlElemDesc * info;
5633
5634 /*
5635 * no chars in buffer
5636 */
5637 if (avail < 1)
5638 goto done;
5639 /*
5640 * not enough chars in buffer
5641 */
5642 if (avail < 2) {
5643 if (!terminate)
5644 goto done;
5645 else
5646 next = ' ';
5647 } else {
5648 next = in->cur[1];
5649 }
5650 cur = in->cur[0];
                /* Anything not starting with '<' is handled as content. */
5651 if (cur != '<') {
5652 ctxt->instate = XML_PARSER_CONTENT;
5653#ifdef DEBUG_PUSH
5654 xmlGenericError(xmlGenericErrorContext,
5655 "HPP: entering CONTENT\n");
5656#endif
5657 break;
5658 }
5659 if (next == '/') {
5660 ctxt->instate = XML_PARSER_END_TAG;
5661 ctxt->checkIndex = 0;
5662#ifdef DEBUG_PUSH
5663 xmlGenericError(xmlGenericErrorContext,
5664 "HPP: entering END_TAG\n");
5665#endif
5666 break;
5667 }
                /* Wait for the '>' closing the tag, quoted values skipped. */
5668 if ((!terminate) &&
5669 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5670 goto done;
5671
5672 /* Capture start position */
5673 if (ctxt->record_info) {
5674 node_info.begin_pos = ctxt->input->consumed +
5675 (CUR_PTR - ctxt->input->base);
5676 node_info.begin_line = ctxt->input->line;
5677 }
5678
5679
5680 failed = htmlParseStartTag(ctxt);
5681 name = ctxt->name;
                /*
                 * Start tag rejected or no current element name: step over
                 * a pending '>' and stay in START_TAG for the next round.
                 */
5682 if ((failed == -1) ||
5683 (name == NULL)) {
5684 if (CUR == '>')
5685 NEXT;
5686 break;
5687 }
5688
5689 /*
5690 * Lookup the info for that element.
5691 */
5692 info = htmlTagLookup(name);
5693 if (info == NULL) {
5694 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5695 "Tag %s invalid\n", name, NULL);
5696 }
5697
5698 /*
5699 * Check for an Empty Element labeled the XML/SGML way
5700 */
5701 if ((CUR == '/') && (NXT(1) == '>')) {
5702 SKIP(2);
5703 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5704 ctxt->sax->endElement(ctxt->userData, name);
5705 htmlnamePop(ctxt);
5706 ctxt->instate = XML_PARSER_CONTENT;
5707#ifdef DEBUG_PUSH
5708 xmlGenericError(xmlGenericErrorContext,
5709 "HPP: entering CONTENT\n");
5710#endif
5711 break;
5712 }
5713
5714 if (CUR == '>') {
5715 NEXT;
5716 } else {
5717 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5718 "Couldn't find end of Start Tag %s\n",
5719 name, NULL);
5720
5721 /*
5722 * end of parsing of this node.
5723 */
5724 if (xmlStrEqual(name, ctxt->name)) {
5725 nodePop(ctxt);
5726 htmlnamePop(ctxt);
5727 }
5728
5729 if (ctxt->record_info)
5730 htmlNodeInfoPush(ctxt, &node_info);
5731
5732 ctxt->instate = XML_PARSER_CONTENT;
5733#ifdef DEBUG_PUSH
5734 xmlGenericError(xmlGenericErrorContext,
5735 "HPP: entering CONTENT\n");
5736#endif
5737 break;
5738 }
5739
5740 /*
5741 * Check for an Empty Element from DTD definition
5742 */
5743 if ((info != NULL) && (info->empty)) {
5744 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5745 ctxt->sax->endElement(ctxt->userData, name);
5746 htmlnamePop(ctxt);
5747 }
5748
5749 if (ctxt->record_info)
5750 htmlNodeInfoPush(ctxt, &node_info);
5751
5752 ctxt->instate = XML_PARSER_CONTENT;
5753#ifdef DEBUG_PUSH
5754 xmlGenericError(xmlGenericErrorContext,
5755 "HPP: entering CONTENT\n");
5756#endif
5757 break;
5758 }
5759 case XML_PARSER_CONTENT: {
                /* ctxt->nbChars sampled before parsing, to detect no progress. */
5760 long cons;
5761 /*
5762 * Handle preparsed entities and charRef
5763 */
5764 if (ctxt->token != 0) {
5765 xmlChar chr[2] = { 0 , 0 } ;
5766
5767 chr[0] = (xmlChar) ctxt->token;
5768 htmlCheckParagraph(ctxt);
5769 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5770 ctxt->sax->characters(ctxt->userData, chr, 1);
5771 ctxt->token = 0;
5772 ctxt->checkIndex = 0;
5773 }
                /*
                 * Last chunk with a single byte left that starts neither a
                 * tag nor a reference: deliver it directly to SAX.
                 */
5774 if ((avail == 1) && (terminate)) {
5775 cur = in->cur[0];
5776 if ((cur != '<') && (cur != '&')) {
5777 if (ctxt->sax != NULL) {
5778 if (IS_BLANK_CH(cur)) {
5779 if (ctxt->keepBlanks) {
5780 if (ctxt->sax->characters != NULL)
5781 ctxt->sax->characters(
5782 ctxt->userData, &in->cur[0], 1);
5783 } else {
5784 if (ctxt->sax->ignorableWhitespace != NULL)
5785 ctxt->sax->ignorableWhitespace(
5786 ctxt->userData, &in->cur[0], 1);
5787 }
5788 } else {
5789 htmlCheckParagraph(ctxt);
5790 if (ctxt->sax->characters != NULL)
5791 ctxt->sax->characters(
5792 ctxt->userData, &in->cur[0], 1);
5793 }
5794 }
5795 ctxt->token = 0;
5796 ctxt->checkIndex = 0;
5797 in->cur++;
5798 break;
5799 }
5800 }
5801 if (avail < 2)
5802 goto done;
5803 cur = in->cur[0];
5804 next = in->cur[1];
5805 cons = ctxt->nbChars;
5806 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5807 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5808 /*
5809 * Handle SCRIPT/STYLE separately
5810 */
5811 if (!terminate) {
5812 int idx;
5813 xmlChar val;
5814
                        /* Need "</" plus one following byte in the buffer. */
5815 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0);
5816 if (idx < 0)
5817 goto done;
5818 val = in->cur[idx + 2];
5819 if (val == 0) /* bad cut of input */
5820 goto done;
5821 }
5822 htmlParseScript(ctxt);
                    /*
                     * cur and next were sampled before htmlParseScript()
                     * ran, so this tests the bytes the content started with.
                     */
5823 if ((cur == '<') && (next == '/')) {
5824 ctxt->instate = XML_PARSER_END_TAG;
5825 ctxt->checkIndex = 0;
5826#ifdef DEBUG_PUSH
5827 xmlGenericError(xmlGenericErrorContext,
5828 "HPP: entering END_TAG\n");
5829#endif
5830 break;
5831 }
5832 } else {
5833 /*
5834 * Sometimes DOCTYPE arrives in the middle of the document
5835 */
5836 if ((cur == '<') && (next == '!') &&
5837 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5838 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5839 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5840 (UPP(8) == 'E')) {
5841 if ((!terminate) &&
5842 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5843 goto done;
5844 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5845 "Misplaced DOCTYPE declaration\n",
5846 BAD_CAST "DOCTYPE" , NULL);
5847 htmlParseDocTypeDecl(ctxt);
5848 } else if ((cur == '<') && (next == '!') &&
5849 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5850 if ((!terminate) &&
5851 (htmlParseLookupSequence(
5852 ctxt, '-', '-', '>', 1, 1) < 0))
5853 goto done;
5854#ifdef DEBUG_PUSH
5855 xmlGenericError(xmlGenericErrorContext,
5856 "HPP: Parsing Comment\n");
5857#endif
5858 htmlParseComment(ctxt);
5859 ctxt->instate = XML_PARSER_CONTENT;
5860 } else if ((cur == '<') && (next == '?')) {
5861 if ((!terminate) &&
5862 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5863 goto done;
5864#ifdef DEBUG_PUSH
5865 xmlGenericError(xmlGenericErrorContext,
5866 "HPP: Parsing PI\n");
5867#endif
5868 htmlParsePI(ctxt);
5869 ctxt->instate = XML_PARSER_CONTENT;
5870 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5871 goto done;
5872 } else if ((cur == '<') && (next == '/')) {
5873 ctxt->instate = XML_PARSER_END_TAG;
5874 ctxt->checkIndex = 0;
5875#ifdef DEBUG_PUSH
5876 xmlGenericError(xmlGenericErrorContext,
5877 "HPP: entering END_TAG\n");
5878#endif
5879 break;
5880 } else if (cur == '<') {
5881 ctxt->instate = XML_PARSER_START_TAG;
5882 ctxt->checkIndex = 0;
5883#ifdef DEBUG_PUSH
5884 xmlGenericError(xmlGenericErrorContext,
5885 "HPP: entering START_TAG\n");
5886#endif
5887 break;
5888 } else if (cur == '&') {
                        /* Wait for one of ';', ' ', '>' or '/' after the '&'. */
5889 if ((!terminate) &&
5890 (htmlParseLookupChars(ctxt,
5891 BAD_CAST "; >/", 4) < 0))
5892 goto done;
5893#ifdef DEBUG_PUSH
5894 xmlGenericError(xmlGenericErrorContext,
5895 "HPP: Parsing Reference\n");
5896#endif
5897 /* TODO: check generation of subtrees if noent !!! */
5898 htmlParseReference(ctxt);
5899 } else {
5900 /*
5901 * check that the text sequence is complete
5902 * before handing out the data to the parser
5903 * to avoid problems with erroneous end of
5904 * data detection.
5905 */
5906 if ((!terminate) &&
5907 (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
5908 goto done;
5909 ctxt->checkIndex = 0;
5910#ifdef DEBUG_PUSH
5911 xmlGenericError(xmlGenericErrorContext,
5912 "HPP: Parsing char data\n");
5913#endif
5914 htmlParseCharData(ctxt);
5915 }
5916 }
                /*
                 * ctxt->nbChars unchanged: nothing was consumed. Report an
                 * error when a current node exists, then step over one char
                 * so that the loop cannot stall.
                 */
5917 if (cons == ctxt->nbChars) {
5918 if (ctxt->node != NULL) {
5919 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5920 "detected an error in element content\n",
5921 NULL, NULL);
5922 }
5923 NEXT;
5924 break;
5925 }
5926
5927 break;
5928 }
5929 case XML_PARSER_END_TAG:
5930 if (avail < 2)
5931 goto done;
5932 if ((!terminate) &&
5933 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5934 goto done;
5935 htmlParseEndTag(ctxt);
                /* Empty name stack: go to EPILOG, otherwise back to CONTENT. */
5936 if (ctxt->nameNr == 0) {
5937 ctxt->instate = XML_PARSER_EPILOG;
5938 } else {
5939 ctxt->instate = XML_PARSER_CONTENT;
5940 }
5941 ctxt->checkIndex = 0;
5942#ifdef DEBUG_PUSH
5943 xmlGenericError(xmlGenericErrorContext,
5944 "HPP: entering CONTENT\n");
5945#endif
5946 break;
            /*
             * The remaining states are never set by this function: each one
             * reports an internal error and falls back to CONTENT, except
             * ATTRIBUTE_VALUE which falls back to START_TAG.
             */
5947 case XML_PARSER_CDATA_SECTION:
5948 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5949 "HPP: internal error, state == CDATA\n",
5950 NULL, NULL);
5951 ctxt->instate = XML_PARSER_CONTENT;
5952 ctxt->checkIndex = 0;
5953#ifdef DEBUG_PUSH
5954 xmlGenericError(xmlGenericErrorContext,
5955 "HPP: entering CONTENT\n");
5956#endif
5957 break;
5958 case XML_PARSER_DTD:
5959 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5960 "HPP: internal error, state == DTD\n",
5961 NULL, NULL);
5962 ctxt->instate = XML_PARSER_CONTENT;
5963 ctxt->checkIndex = 0;
5964#ifdef DEBUG_PUSH
5965 xmlGenericError(xmlGenericErrorContext,
5966 "HPP: entering CONTENT\n");
5967#endif
5968 break;
5969 case XML_PARSER_COMMENT:
5970 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5971 "HPP: internal error, state == COMMENT\n",
5972 NULL, NULL);
5973 ctxt->instate = XML_PARSER_CONTENT;
5974 ctxt->checkIndex = 0;
5975#ifdef DEBUG_PUSH
5976 xmlGenericError(xmlGenericErrorContext,
5977 "HPP: entering CONTENT\n");
5978#endif
5979 break;
5980 case XML_PARSER_PI:
5981 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5982 "HPP: internal error, state == PI\n",
5983 NULL, NULL);
5984 ctxt->instate = XML_PARSER_CONTENT;
5985 ctxt->checkIndex = 0;
5986#ifdef DEBUG_PUSH
5987 xmlGenericError(xmlGenericErrorContext,
5988 "HPP: entering CONTENT\n");
5989#endif
5990 break;
5991 case XML_PARSER_ENTITY_DECL:
5992 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5993 "HPP: internal error, state == ENTITY_DECL\n",
5994 NULL, NULL);
5995 ctxt->instate = XML_PARSER_CONTENT;
5996 ctxt->checkIndex = 0;
5997#ifdef DEBUG_PUSH
5998 xmlGenericError(xmlGenericErrorContext,
5999 "HPP: entering CONTENT\n");
6000#endif
6001 break;
6002 case XML_PARSER_ENTITY_VALUE:
6003 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6004 "HPP: internal error, state == ENTITY_VALUE\n",
6005 NULL, NULL);
6006 ctxt->instate = XML_PARSER_CONTENT;
6007 ctxt->checkIndex = 0;
                /* NOTE(review): debug text says DTD, the state set is CONTENT. */
6008#ifdef DEBUG_PUSH
6009 xmlGenericError(xmlGenericErrorContext,
6010 "HPP: entering DTD\n");
6011#endif
6012 break;
6013 case XML_PARSER_ATTRIBUTE_VALUE:
6014 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6015 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
6016 NULL, NULL);
6017 ctxt->instate = XML_PARSER_START_TAG;
6018 ctxt->checkIndex = 0;
6019#ifdef DEBUG_PUSH
6020 xmlGenericError(xmlGenericErrorContext,
6021 "HPP: entering START_TAG\n");
6022#endif
6023 break;
6024 case XML_PARSER_SYSTEM_LITERAL:
6025 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6026 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
6027 NULL, NULL);
6028 ctxt->instate = XML_PARSER_CONTENT;
6029 ctxt->checkIndex = 0;
6030#ifdef DEBUG_PUSH
6031 xmlGenericError(xmlGenericErrorContext,
6032 "HPP: entering CONTENT\n");
6033#endif
6034 break;
6035 case XML_PARSER_IGNORE:
6036 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6037 "HPP: internal error, state == XML_PARSER_IGNORE\n",
6038 NULL, NULL);
6039 ctxt->instate = XML_PARSER_CONTENT;
6040 ctxt->checkIndex = 0;
6041#ifdef DEBUG_PUSH
6042 xmlGenericError(xmlGenericErrorContext,
6043 "HPP: entering CONTENT\n");
6044#endif
6045 break;
                /*
                 * NOTE(review): the message below names XML_PARSER_LITERAL
                 * although the state handled is XML_PARSER_PUBLIC_LITERAL.
                 */
6046 case XML_PARSER_PUBLIC_LITERAL:
6047 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6048 "HPP: internal error, state == XML_PARSER_LITERAL\n",
6049 NULL, NULL);
6050 ctxt->instate = XML_PARSER_CONTENT;
6051 ctxt->checkIndex = 0;
6052#ifdef DEBUG_PUSH
6053 xmlGenericError(xmlGenericErrorContext,
6054 "HPP: entering CONTENT\n");
6055#endif
6056 break;
6057
6058 }
6059 }
6060done:
    /*
     * Same end-of-input handling as at the top of the loop, using the last
     * computed value of avail.
     */
6061 if ((avail == 0) && (terminate)) {
6062 htmlAutoCloseOnEnd(ctxt);
6063 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
6064 /*
6065 * SAX: end of the document processing.
6066 */
6067 ctxt->instate = XML_PARSER_EOF;
6068 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6069 ctxt->sax->endDocument(ctxt->userData);
6070 }
6071 }
    /*
     * Unless HTML_PARSE_NODEFDTD is set, give a document that still has no
     * internal subset the HTML 4.0 Transitional one once parsing is ending
     * (last chunk, EOF or EPILOG state).
     */
6072 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
6073 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
6074 (ctxt->instate == XML_PARSER_EPILOG))) {
6075 xmlDtdPtr dtd;
6076 dtd = xmlGetIntSubset(ctxt->myDoc);
6077 if (dtd == NULL)
6078 ctxt->myDoc->intSubset =
6079 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
6080 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6081 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6082 }
6083#ifdef DEBUG_PUSH
6084 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6085#endif
6086 return(ret);
6087}
6088
6089/**
6090 * htmlParseChunk:
6091 * @ctxt: an HTML parser context
6092 * @chunk: an char array
6093 * @size: the size in byte of the chunk
6094 * @terminate: last chunk indicator
6095 *
6096 * Parse a Chunk of memory
6097 *
6098 * Returns zero if no error, the xmlParserErrors otherwise.
6099 */
6100int
6101htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6102 int terminate) {
6103 if ((ctxt == NULL) || (ctxt->input == NULL)) {
6104 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6105 "htmlParseChunk: context error\n", NULL, NULL);
6106 return(XML_ERR_INTERNAL_ERROR);
6107 }
6108 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6109 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
6110 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6111 size_t cur = ctxt->input->cur - ctxt->input->base;
6112 int res;
6113
6114 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6115 if (res < 0) {
6116 ctxt->errNo = XML_PARSER_EOF;
6117 ctxt->disableSAX = 1;
6118 return (XML_PARSER_EOF);
6119 }
6120 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6121#ifdef DEBUG_PUSH
6122 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6123#endif
6124
6125#if 0
6126 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6127 htmlParseTryOrFinish(ctxt, terminate);
6128#endif
6129 } else if (ctxt->instate != XML_PARSER_EOF) {
6130 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6131 xmlParserInputBufferPtr in = ctxt->input->buf;
6132 if ((in->encoder != NULL) && (in->buffer != NULL) &&
6133 (in->raw != NULL)) {
6134 int nbchars;
6135 size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6136 size_t current = ctxt->input->cur - ctxt->input->base;
6137
6138 nbchars = xmlCharEncInput(in, terminate);
6139 if (nbchars < 0) {
6140 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6141 "encoder error\n", NULL, NULL);
6142 return(XML_ERR_INVALID_ENCODING);
6143 }
6144 xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
6145 }
6146 }
6147 }
6148 htmlParseTryOrFinish(ctxt, terminate);
6149 if (terminate) {
6150 if ((ctxt->instate != XML_PARSER_EOF) &&
6151 (ctxt->instate != XML_PARSER_EPILOG) &&
6152 (ctxt->instate != XML_PARSER_MISC)) {
6153 ctxt->errNo = XML_ERR_DOCUMENT_END;
6154 ctxt->wellFormed = 0;
6155 }
6156 if (ctxt->instate != XML_PARSER_EOF) {
6157 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6158 ctxt->sax->endDocument(ctxt->userData);
6159 }
6160 ctxt->instate = XML_PARSER_EOF;
6161 }
6162 return((xmlParserErrors) ctxt->errNo);
6163}
6164
6165/************************************************************************
6166 * *
6167 * User entry points *
6168 * *
6169 ************************************************************************/
6170
6171/**
6172 * htmlCreatePushParserCtxt:
6173 * @sax: a SAX handler
6174 * @user_data: The user data returned on SAX callbacks
6175 * @chunk: a pointer to an array of chars
6176 * @size: number of chars in the array
6177 * @filename: an optional file name or URI
6178 * @enc: an optional encoding
6179 *
6180 * Create a parser context for using the HTML parser in push mode
6181 * The value of @filename is used for fetching external entities
6182 * and error/warning reports.
6183 *
6184 * Returns the new parser context or NULL
6185 */
6186htmlParserCtxtPtr
6187htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6188 const char *chunk, int size, const char *filename,
6189 xmlCharEncoding enc) {
6190 htmlParserCtxtPtr ctxt;
6191 htmlParserInputPtr inputStream;
6192 xmlParserInputBufferPtr buf;
6193
6194 xmlInitParser();
6195
6196 buf = xmlAllocParserInputBuffer(enc);
6197 if (buf == NULL) return(NULL);
6198
6199 ctxt = htmlNewParserCtxt();
6200 if (ctxt == NULL) {
6201 xmlFreeParserInputBuffer(buf);
6202 return(NULL);
6203 }
6204 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6205 ctxt->charset=XML_CHAR_ENCODING_UTF8;
6206 if (sax != NULL) {
6207 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6208 xmlFree(ctxt->sax);
6209 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6210 if (ctxt->sax == NULL) {
6211 xmlFree(buf);
6212 xmlFree(ctxt);
6213 return(NULL);
6214 }
6215 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6216 if (user_data != NULL)
6217 ctxt->userData = user_data;
6218 }
6219 if (filename == NULL) {
6220 ctxt->directory = NULL;
6221 } else {
6222 ctxt->directory = xmlParserGetDirectory(filename);
6223 }
6224
6225 inputStream = htmlNewInputStream(ctxt);
6226 if (inputStream == NULL) {
6227 xmlFreeParserCtxt(ctxt);
6228 xmlFree(buf);
6229 return(NULL);
6230 }
6231
6232 if (filename == NULL)
6233 inputStream->filename = NULL;
6234 else
6235 inputStream->filename = (char *)
6236 xmlCanonicPath((const xmlChar *) filename);
6237 inputStream->buf = buf;
6238 xmlBufResetInput(buf->buffer, inputStream);
6239
6240 inputPush(ctxt, inputStream);
6241
6242 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6243 (ctxt->input->buf != NULL)) {
6244 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6245 size_t cur = ctxt->input->cur - ctxt->input->base;
6246
6247 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6248
6249 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6250#ifdef DEBUG_PUSH
6251 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6252#endif
6253 }
6254 ctxt->progressive = 1;
6255
6256 return(ctxt);
6257}
6258#endif /* LIBXML_PUSH_ENABLED */
6259
6260/**
6261 * htmlSAXParseDoc:
6262 * @cur: a pointer to an array of xmlChar
6263 * @encoding: a free form C string describing the HTML document encoding, or NULL
6264 * @sax: the SAX handler block
6265 * @userData: if using SAX, this pointer will be provided on callbacks.
6266 *
6267 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6268 * to handle parse events. If sax is NULL, fallback to the default DOM
6269 * behavior and return a tree.
6270 *
6271 * Returns the resulting document tree unless SAX is NULL or the document is
6272 * not well formed.
6273 */
6274
6275htmlDocPtr
6276htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
6277 htmlDocPtr ret;
6278 htmlParserCtxtPtr ctxt;
6279
6280 xmlInitParser();
6281
6282 if (cur == NULL) return(NULL);
6283
6284
6285 ctxt = htmlCreateDocParserCtxt(cur, encoding);
6286 if (ctxt == NULL) return(NULL);
6287 if (sax != NULL) {
6288 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6289 ctxt->sax = sax;
6290 ctxt->userData = userData;
6291 }
6292
6293 htmlParseDocument(ctxt);
6294 ret = ctxt->myDoc;
6295 if (sax != NULL) {
6296 ctxt->sax = NULL;
6297 ctxt->userData = NULL;
6298 }
6299 htmlFreeParserCtxt(ctxt);
6300
6301 return(ret);
6302}
6303
6304/**
6305 * htmlParseDoc:
6306 * @cur: a pointer to an array of xmlChar
6307 * @encoding: a free form C string describing the HTML document encoding, or NULL
6308 *
6309 * parse an HTML in-memory document and build a tree.
6310 *
6311 * Returns the resulting document tree
6312 */
6313
6314htmlDocPtr
6315htmlParseDoc(xmlChar *cur, const char *encoding) {
6316 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6317}
6318
6319
6320/**
6321 * htmlCreateFileParserCtxt:
6322 * @filename: the filename
6323 * @encoding: a free form C string describing the HTML document encoding, or NULL
6324 *
6325 * Create a parser context for a file content.
6326 * Automatic support for ZLIB/Compress compressed document is provided
6327 * by default if found at compile-time.
6328 *
6329 * Returns the new parser context or NULL
6330 */
6331htmlParserCtxtPtr
6332htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6333{
6334 htmlParserCtxtPtr ctxt;
6335 htmlParserInputPtr inputStream;
6336 char *canonicFilename;
6337 /* htmlCharEncoding enc; */
6338 xmlChar *content, *content_line = (xmlChar *) "charset=";
6339
6340 if (filename == NULL)
6341 return(NULL);
6342
6343 ctxt = htmlNewParserCtxt();
6344 if (ctxt == NULL) {
6345 return(NULL);
6346 }
6347 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6348 if (canonicFilename == NULL) {
6349#ifdef LIBXML_SAX1_ENABLED
6350 if (xmlDefaultSAXHandler.error != NULL) {
6351 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6352 }
6353#endif
6354 xmlFreeParserCtxt(ctxt);
6355 return(NULL);
6356 }
6357
6358 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6359 xmlFree(canonicFilename);
6360 if (inputStream == NULL) {
6361 xmlFreeParserCtxt(ctxt);
6362 return(NULL);
6363 }
6364
6365 inputPush(ctxt, inputStream);
6366
6367 /* set encoding */
6368 if (encoding) {
6369 size_t l = strlen(encoding);
6370
6371 if (l < 1000) {
6372 content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6373 if (content) {
6374 strcpy ((char *)content, (char *)content_line);
6375 strcat ((char *)content, (char *)encoding);
6376 htmlCheckEncoding (ctxt, content);
6377 xmlFree (content);
6378 }
6379 }
6380 }
6381
6382 return(ctxt);
6383}
6384
6385/**
6386 * htmlSAXParseFile:
6387 * @filename: the filename
6388 * @encoding: a free form C string describing the HTML document encoding, or NULL
6389 * @sax: the SAX handler block
6390 * @userData: if using SAX, this pointer will be provided on callbacks.
6391 *
6392 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6393 * compressed document is provided by default if found at compile-time.
6394 * It use the given SAX function block to handle the parsing callback.
6395 * If sax is NULL, fallback to the default DOM tree building routines.
6396 *
6397 * Returns the resulting document tree unless SAX is NULL or the document is
6398 * not well formed.
6399 */
6400
6401htmlDocPtr
6402htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6403 void *userData) {
6404 htmlDocPtr ret;
6405 htmlParserCtxtPtr ctxt;
6406 htmlSAXHandlerPtr oldsax = NULL;
6407
6408 xmlInitParser();
6409
6410 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6411 if (ctxt == NULL) return(NULL);
6412 if (sax != NULL) {
6413 oldsax = ctxt->sax;
6414 ctxt->sax = sax;
6415 ctxt->userData = userData;
6416 }
6417
6418 htmlParseDocument(ctxt);
6419
6420 ret = ctxt->myDoc;
6421 if (sax != NULL) {
6422 ctxt->sax = oldsax;
6423 ctxt->userData = NULL;
6424 }
6425 htmlFreeParserCtxt(ctxt);
6426
6427 return(ret);
6428}
6429
6430/**
6431 * htmlParseFile:
6432 * @filename: the filename
6433 * @encoding: a free form C string describing the HTML document encoding, or NULL
6434 *
6435 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6436 * compressed document is provided by default if found at compile-time.
6437 *
6438 * Returns the resulting document tree
6439 */
6440
6441htmlDocPtr
6442htmlParseFile(const char *filename, const char *encoding) {
6443 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6444}
6445
6446/**
6447 * htmlHandleOmittedElem:
6448 * @val: int 0 or 1
6449 *
6450 * Set and return the previous value for handling HTML omitted tags.
6451 *
6452 * Returns the last value for 0 for no handling, 1 for auto insertion.
6453 */
6454
6455int
6456htmlHandleOmittedElem(int val) {
6457 int old = htmlOmittedDefaultValue;
6458
6459 htmlOmittedDefaultValue = val;
6460 return(old);
6461}
6462
6463/**
6464 * htmlElementAllowedHere:
6465 * @parent: HTML parent element
6466 * @elt: HTML element
6467 *
6468 * Checks whether an HTML element may be a direct child of a parent element.
6469 * Note - doesn't check for deprecated elements
6470 *
6471 * Returns 1 if allowed; 0 otherwise.
6472 */
6473int
6474htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6475 const char** p ;
6476
6477 if ( ! elt || ! parent || ! parent->subelts )
6478 return 0 ;
6479
6480 for ( p = parent->subelts; *p; ++p )
6481 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6482 return 1 ;
6483
6484 return 0 ;
6485}
6486/**
6487 * htmlElementStatusHere:
6488 * @parent: HTML parent element
6489 * @elt: HTML element
6490 *
6491 * Checks whether an HTML element may be a direct child of a parent element.
6492 * and if so whether it is valid or deprecated.
6493 *
6494 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6495 */
6496htmlStatus
6497htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6498 if ( ! parent || ! elt )
6499 return HTML_INVALID ;
6500 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6501 return HTML_INVALID ;
6502
6503 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6504}
6505/**
6506 * htmlAttrAllowed:
6507 * @elt: HTML element
6508 * @attr: HTML attribute
6509 * @legacy: whether to allow deprecated attributes
6510 *
6511 * Checks whether an attribute is valid for an element
6512 * Has full knowledge of Required and Deprecated attributes
6513 *
6514 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6515 */
6516htmlStatus
6517htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6518 const char** p ;
6519
6520 if ( !elt || ! attr )
6521 return HTML_INVALID ;
6522
6523 if ( elt->attrs_req )
6524 for ( p = elt->attrs_req; *p; ++p)
6525 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6526 return HTML_REQUIRED ;
6527
6528 if ( elt->attrs_opt )
6529 for ( p = elt->attrs_opt; *p; ++p)
6530 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6531 return HTML_VALID ;
6532
6533 if ( legacy && elt->attrs_depr )
6534 for ( p = elt->attrs_depr; *p; ++p)
6535 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6536 return HTML_DEPRECATED ;
6537
6538 return HTML_INVALID ;
6539}
6540/**
6541 * htmlNodeStatus:
6542 * @node: an htmlNodePtr in a tree
6543 * @legacy: whether to allow deprecated elements (YES is faster here
6544 * for Element nodes)
6545 *
6546 * Checks whether the tree node is valid. Experimental (the author
6547 * only uses the HTML enhancements in a SAX parser)
6548 *
6549 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6550 * legacy allowed) or htmlElementStatusHere (otherwise).
6551 * for Attribute nodes, a return from htmlAttrAllowed
6552 * for other nodes, HTML_NA (no checks performed)
6553 */
6554htmlStatus
6555htmlNodeStatus(const htmlNodePtr node, int legacy) {
6556 if ( ! node )
6557 return HTML_INVALID ;
6558
6559 switch ( node->type ) {
6560 case XML_ELEMENT_NODE:
6561 return legacy
6562 ? ( htmlElementAllowedHere (
6563 htmlTagLookup(node->parent->name) , node->name
6564 ) ? HTML_VALID : HTML_INVALID )
6565 : htmlElementStatusHere(
6566 htmlTagLookup(node->parent->name) ,
6567 htmlTagLookup(node->name) )
6568 ;
6569 case XML_ATTRIBUTE_NODE:
6570 return htmlAttrAllowed(
6571 htmlTagLookup(node->parent->name) , node->name, legacy) ;
6572 default: return HTML_NA ;
6573 }
6574}
6575/************************************************************************
6576 * *
6577 * New set (2.6.0) of simpler and more flexible APIs *
6578 * *
6579 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A NULL @str is ignored.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement: it must be followed by a semicolon and can be used
 * safely as the body of an if/else.
 */
#define DICT_FREE(str)                                              \
    do {                                                            \
        if ((str) && ((!dict) ||                                    \
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))      \
            xmlFree((char *)(str));                                 \
    } while (0)
6591
6592/**
6593 * htmlCtxtReset:
6594 * @ctxt: an HTML parser context
6595 *
6596 * Reset a parser context
6597 */
6598void
6599htmlCtxtReset(htmlParserCtxtPtr ctxt)
6600{
6601 xmlParserInputPtr input;
6602 xmlDictPtr dict;
6603
6604 if (ctxt == NULL)
6605 return;
6606
6607 xmlInitParser();
6608 dict = ctxt->dict;
6609
6610 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6611 xmlFreeInputStream(input);
6612 }
6613 ctxt->inputNr = 0;
6614 ctxt->input = NULL;
6615
6616 ctxt->spaceNr = 0;
6617 if (ctxt->spaceTab != NULL) {
6618 ctxt->spaceTab[0] = -1;
6619 ctxt->space = &ctxt->spaceTab[0];
6620 } else {
6621 ctxt->space = NULL;
6622 }
6623
6624
6625 ctxt->nodeNr = 0;
6626 ctxt->node = NULL;
6627
6628 ctxt->nameNr = 0;
6629 ctxt->name = NULL;
6630
6631 DICT_FREE(ctxt->version);
6632 ctxt->version = NULL;
6633 DICT_FREE(ctxt->encoding);
6634 ctxt->encoding = NULL;
6635 DICT_FREE(ctxt->directory);
6636 ctxt->directory = NULL;
6637 DICT_FREE(ctxt->extSubURI);
6638 ctxt->extSubURI = NULL;
6639 DICT_FREE(ctxt->extSubSystem);
6640 ctxt->extSubSystem = NULL;
6641 if (ctxt->myDoc != NULL)
6642 xmlFreeDoc(ctxt->myDoc);
6643 ctxt->myDoc = NULL;
6644
6645 ctxt->standalone = -1;
6646 ctxt->hasExternalSubset = 0;
6647 ctxt->hasPErefs = 0;
6648 ctxt->html = 1;
6649 ctxt->external = 0;
6650 ctxt->instate = XML_PARSER_START;
6651 ctxt->token = 0;
6652
6653 ctxt->wellFormed = 1;
6654 ctxt->nsWellFormed = 1;
6655 ctxt->disableSAX = 0;
6656 ctxt->valid = 1;
6657 ctxt->vctxt.userData = ctxt;
6658 ctxt->vctxt.error = xmlParserValidityError;
6659 ctxt->vctxt.warning = xmlParserValidityWarning;
6660 ctxt->record_info = 0;
6661 ctxt->nbChars = 0;
6662 ctxt->checkIndex = 0;
6663 ctxt->inSubset = 0;
6664 ctxt->errNo = XML_ERR_OK;
6665 ctxt->depth = 0;
6666 ctxt->charset = XML_CHAR_ENCODING_NONE;
6667 ctxt->catalogs = NULL;
6668 xmlInitNodeInfoSeq(&ctxt->node_seq);
6669
6670 if (ctxt->attsDefault != NULL) {
6671 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
6672 ctxt->attsDefault = NULL;
6673 }
6674 if (ctxt->attsSpecial != NULL) {
6675 xmlHashFree(ctxt->attsSpecial, NULL);
6676 ctxt->attsSpecial = NULL;
6677 }
6678}
6679
6680/**
6681 * htmlCtxtUseOptions:
6682 * @ctxt: an HTML parser context
6683 * @options: a combination of htmlParserOption(s)
6684 *
6685 * Applies the options to the parser context
6686 *
6687 * Returns 0 in case of success, the set of unknown or unimplemented options
6688 * in case of error.
6689 */
6690int
6691htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6692{
6693 if (ctxt == NULL)
6694 return(-1);
6695
6696 if (options & HTML_PARSE_NOWARNING) {
6697 ctxt->sax->warning = NULL;
6698 ctxt->vctxt.warning = NULL;
6699 options -= XML_PARSE_NOWARNING;
6700 ctxt->options |= XML_PARSE_NOWARNING;
6701 }
6702 if (options & HTML_PARSE_NOERROR) {
6703 ctxt->sax->error = NULL;
6704 ctxt->vctxt.error = NULL;
6705 ctxt->sax->fatalError = NULL;
6706 options -= XML_PARSE_NOERROR;
6707 ctxt->options |= XML_PARSE_NOERROR;
6708 }
6709 if (options & HTML_PARSE_PEDANTIC) {
6710 ctxt->pedantic = 1;
6711 options -= XML_PARSE_PEDANTIC;
6712 ctxt->options |= XML_PARSE_PEDANTIC;
6713 } else
6714 ctxt->pedantic = 0;
6715 if (options & XML_PARSE_NOBLANKS) {
6716 ctxt->keepBlanks = 0;
6717 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6718 options -= XML_PARSE_NOBLANKS;
6719 ctxt->options |= XML_PARSE_NOBLANKS;
6720 } else
6721 ctxt->keepBlanks = 1;
6722 if (options & HTML_PARSE_RECOVER) {
6723 ctxt->recovery = 1;
6724 options -= HTML_PARSE_RECOVER;
6725 } else
6726 ctxt->recovery = 0;
6727 if (options & HTML_PARSE_COMPACT) {
6728 ctxt->options |= HTML_PARSE_COMPACT;
6729 options -= HTML_PARSE_COMPACT;
6730 }
6731 if (options & XML_PARSE_HUGE) {
6732 ctxt->options |= XML_PARSE_HUGE;
6733 options -= XML_PARSE_HUGE;
6734 }
6735 if (options & HTML_PARSE_NODEFDTD) {
6736 ctxt->options |= HTML_PARSE_NODEFDTD;
6737 options -= HTML_PARSE_NODEFDTD;
6738 }
6739 if (options & HTML_PARSE_IGNORE_ENC) {
6740 ctxt->options |= HTML_PARSE_IGNORE_ENC;
6741 options -= HTML_PARSE_IGNORE_ENC;
6742 }
6743 if (options & HTML_PARSE_NOIMPLIED) {
6744 ctxt->options |= HTML_PARSE_NOIMPLIED;
6745 options -= HTML_PARSE_NOIMPLIED;
6746 }
6747 ctxt->dictNames = 0;
6748 return (options);
6749}
6750
6751/**
6752 * htmlDoRead:
6753 * @ctxt: an HTML parser context
6754 * @URL: the base URL to use for the document
6755 * @encoding: the document encoding, or NULL
6756 * @options: a combination of htmlParserOption(s)
6757 * @reuse: keep the context for reuse
6758 *
6759 * Common front-end for the htmlRead functions
6760 *
6761 * Returns the resulting document tree or NULL
6762 */
6763static htmlDocPtr
6764htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6765 int options, int reuse)
6766{
6767 htmlDocPtr ret;
6768
6769 htmlCtxtUseOptions(ctxt, options);
6770 ctxt->html = 1;
6771 if (encoding != NULL) {
6772 xmlCharEncodingHandlerPtr hdlr;
6773
6774 hdlr = xmlFindCharEncodingHandler(encoding);
6775 if (hdlr != NULL) {
6776 xmlSwitchToEncoding(ctxt, hdlr);
6777 if (ctxt->input->encoding != NULL)
6778 xmlFree((xmlChar *) ctxt->input->encoding);
6779 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6780 }
6781 }
6782 if ((URL != NULL) && (ctxt->input != NULL) &&
6783 (ctxt->input->filename == NULL))
6784 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6785 htmlParseDocument(ctxt);
6786 ret = ctxt->myDoc;
6787 ctxt->myDoc = NULL;
6788 if (!reuse) {
6789 if ((ctxt->dictNames) &&
6790 (ret != NULL) &&
6791 (ret->dict == ctxt->dict))
6792 ctxt->dict = NULL;
6793 xmlFreeParserCtxt(ctxt);
6794 }
6795 return (ret);
6796}
6797
6798/**
6799 * htmlReadDoc:
6800 * @cur: a pointer to a zero terminated string
6801 * @URL: the base URL to use for the document
6802 * @encoding: the document encoding, or NULL
6803 * @options: a combination of htmlParserOption(s)
6804 *
6805 * parse an XML in-memory document and build a tree.
6806 *
6807 * Returns the resulting document tree
6808 */
6809htmlDocPtr
6810htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6811{
6812 htmlParserCtxtPtr ctxt;
6813
6814 if (cur == NULL)
6815 return (NULL);
6816
6817 xmlInitParser();
6818 ctxt = htmlCreateDocParserCtxt(cur, NULL);
6819 if (ctxt == NULL)
6820 return (NULL);
6821 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6822}
6823
6824/**
6825 * htmlReadFile:
6826 * @filename: a file or URL
6827 * @encoding: the document encoding, or NULL
6828 * @options: a combination of htmlParserOption(s)
6829 *
6830 * parse an XML file from the filesystem or the network.
6831 *
6832 * Returns the resulting document tree
6833 */
6834htmlDocPtr
6835htmlReadFile(const char *filename, const char *encoding, int options)
6836{
6837 htmlParserCtxtPtr ctxt;
6838
6839 xmlInitParser();
6840 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6841 if (ctxt == NULL)
6842 return (NULL);
6843 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6844}
6845
6846/**
6847 * htmlReadMemory:
6848 * @buffer: a pointer to a char array
6849 * @size: the size of the array
6850 * @URL: the base URL to use for the document
6851 * @encoding: the document encoding, or NULL
6852 * @options: a combination of htmlParserOption(s)
6853 *
6854 * parse an XML in-memory document and build a tree.
6855 *
6856 * Returns the resulting document tree
6857 */
6858htmlDocPtr
6859htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6860{
6861 htmlParserCtxtPtr ctxt;
6862
6863 xmlInitParser();
6864 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6865 if (ctxt == NULL)
6866 return (NULL);
6867 htmlDefaultSAXHandlerInit();
6868 if (ctxt->sax != NULL)
6869 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6870 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6871}
6872
6873/**
6874 * htmlReadFd:
6875 * @fd: an open file descriptor
6876 * @URL: the base URL to use for the document
6877 * @encoding: the document encoding, or NULL
6878 * @options: a combination of htmlParserOption(s)
6879 *
6880 * parse an XML from a file descriptor and build a tree.
6881 *
6882 * Returns the resulting document tree
6883 */
6884htmlDocPtr
6885htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6886{
6887 htmlParserCtxtPtr ctxt;
6888 xmlParserInputBufferPtr input;
6889 xmlParserInputPtr stream;
6890
6891 if (fd < 0)
6892 return (NULL);
6893 xmlInitParser();
6894
6895 xmlInitParser();
6896 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6897 if (input == NULL)
6898 return (NULL);
6899 ctxt = xmlNewParserCtxt();
6900 if (ctxt == NULL) {
6901 xmlFreeParserInputBuffer(input);
6902 return (NULL);
6903 }
6904 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6905 if (stream == NULL) {
6906 xmlFreeParserInputBuffer(input);
6907 xmlFreeParserCtxt(ctxt);
6908 return (NULL);
6909 }
6910 inputPush(ctxt, stream);
6911 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6912}
6913
6914/**
6915 * htmlReadIO:
6916 * @ioread: an I/O read function
6917 * @ioclose: an I/O close function
6918 * @ioctx: an I/O handler
6919 * @URL: the base URL to use for the document
6920 * @encoding: the document encoding, or NULL
6921 * @options: a combination of htmlParserOption(s)
6922 *
6923 * parse an HTML document from I/O functions and source and build a tree.
6924 *
6925 * Returns the resulting document tree
6926 */
6927htmlDocPtr
6928htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6929 void *ioctx, const char *URL, const char *encoding, int options)
6930{
6931 htmlParserCtxtPtr ctxt;
6932 xmlParserInputBufferPtr input;
6933 xmlParserInputPtr stream;
6934
6935 if (ioread == NULL)
6936 return (NULL);
6937 xmlInitParser();
6938
6939 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6940 XML_CHAR_ENCODING_NONE);
6941 if (input == NULL) {
6942 if (ioclose != NULL)
6943 ioclose(ioctx);
6944 return (NULL);
6945 }
6946 ctxt = htmlNewParserCtxt();
6947 if (ctxt == NULL) {
6948 xmlFreeParserInputBuffer(input);
6949 return (NULL);
6950 }
6951 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6952 if (stream == NULL) {
6953 xmlFreeParserInputBuffer(input);
6954 xmlFreeParserCtxt(ctxt);
6955 return (NULL);
6956 }
6957 inputPush(ctxt, stream);
6958 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6959}
6960
6961/**
6962 * htmlCtxtReadDoc:
6963 * @ctxt: an HTML parser context
6964 * @cur: a pointer to a zero terminated string
6965 * @URL: the base URL to use for the document
6966 * @encoding: the document encoding, or NULL
6967 * @options: a combination of htmlParserOption(s)
6968 *
6969 * parse an XML in-memory document and build a tree.
6970 * This reuses the existing @ctxt parser context
6971 *
6972 * Returns the resulting document tree
6973 */
6974htmlDocPtr
6975htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6976 const char *URL, const char *encoding, int options)
6977{
6978 xmlParserInputPtr stream;
6979
6980 if (cur == NULL)
6981 return (NULL);
6982 if (ctxt == NULL)
6983 return (NULL);
6984 xmlInitParser();
6985
6986 htmlCtxtReset(ctxt);
6987
6988 stream = xmlNewStringInputStream(ctxt, cur);
6989 if (stream == NULL) {
6990 return (NULL);
6991 }
6992 inputPush(ctxt, stream);
6993 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6994}
6995
6996/**
6997 * htmlCtxtReadFile:
6998 * @ctxt: an HTML parser context
6999 * @filename: a file or URL
7000 * @encoding: the document encoding, or NULL
7001 * @options: a combination of htmlParserOption(s)
7002 *
7003 * parse an XML file from the filesystem or the network.
7004 * This reuses the existing @ctxt parser context
7005 *
7006 * Returns the resulting document tree
7007 */
7008htmlDocPtr
7009htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
7010 const char *encoding, int options)
7011{
7012 xmlParserInputPtr stream;
7013
7014 if (filename == NULL)
7015 return (NULL);
7016 if (ctxt == NULL)
7017 return (NULL);
7018 xmlInitParser();
7019
7020 htmlCtxtReset(ctxt);
7021
7022 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
7023 if (stream == NULL) {
7024 return (NULL);
7025 }
7026 inputPush(ctxt, stream);
7027 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
7028}
7029
7030/**
7031 * htmlCtxtReadMemory:
7032 * @ctxt: an HTML parser context
7033 * @buffer: a pointer to a char array
7034 * @size: the size of the array
7035 * @URL: the base URL to use for the document
7036 * @encoding: the document encoding, or NULL
7037 * @options: a combination of htmlParserOption(s)
7038 *
7039 * parse an XML in-memory document and build a tree.
7040 * This reuses the existing @ctxt parser context
7041 *
7042 * Returns the resulting document tree
7043 */
7044htmlDocPtr
7045htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7046 const char *URL, const char *encoding, int options)
7047{
7048 xmlParserInputBufferPtr input;
7049 xmlParserInputPtr stream;
7050
7051 if (ctxt == NULL)
7052 return (NULL);
7053 if (buffer == NULL)
7054 return (NULL);
7055 xmlInitParser();
7056
7057 htmlCtxtReset(ctxt);
7058
7059 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7060 if (input == NULL) {
7061 return(NULL);
7062 }
7063
7064 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7065 if (stream == NULL) {
7066 xmlFreeParserInputBuffer(input);
7067 return(NULL);
7068 }
7069
7070 inputPush(ctxt, stream);
7071 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7072}
7073
7074/**
7075 * htmlCtxtReadFd:
7076 * @ctxt: an HTML parser context
7077 * @fd: an open file descriptor
7078 * @URL: the base URL to use for the document
7079 * @encoding: the document encoding, or NULL
7080 * @options: a combination of htmlParserOption(s)
7081 *
7082 * parse an XML from a file descriptor and build a tree.
7083 * This reuses the existing @ctxt parser context
7084 *
7085 * Returns the resulting document tree
7086 */
7087htmlDocPtr
7088htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7089 const char *URL, const char *encoding, int options)
7090{
7091 xmlParserInputBufferPtr input;
7092 xmlParserInputPtr stream;
7093
7094 if (fd < 0)
7095 return (NULL);
7096 if (ctxt == NULL)
7097 return (NULL);
7098 xmlInitParser();
7099
7100 htmlCtxtReset(ctxt);
7101
7102
7103 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7104 if (input == NULL)
7105 return (NULL);
7106 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7107 if (stream == NULL) {
7108 xmlFreeParserInputBuffer(input);
7109 return (NULL);
7110 }
7111 inputPush(ctxt, stream);
7112 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7113}
7114
7115/**
7116 * htmlCtxtReadIO:
7117 * @ctxt: an HTML parser context
7118 * @ioread: an I/O read function
7119 * @ioclose: an I/O close function
7120 * @ioctx: an I/O handler
7121 * @URL: the base URL to use for the document
7122 * @encoding: the document encoding, or NULL
7123 * @options: a combination of htmlParserOption(s)
7124 *
7125 * parse an HTML document from I/O functions and source and build a tree.
7126 * This reuses the existing @ctxt parser context
7127 *
7128 * Returns the resulting document tree
7129 */
7130htmlDocPtr
7131htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7132 xmlInputCloseCallback ioclose, void *ioctx,
7133 const char *URL,
7134 const char *encoding, int options)
7135{
7136 xmlParserInputBufferPtr input;
7137 xmlParserInputPtr stream;
7138
7139 if (ioread == NULL)
7140 return (NULL);
7141 if (ctxt == NULL)
7142 return (NULL);
7143 xmlInitParser();
7144
7145 htmlCtxtReset(ctxt);
7146
7147 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7148 XML_CHAR_ENCODING_NONE);
7149 if (input == NULL) {
7150 if (ioclose != NULL)
7151 ioclose(ioctx);
7152 return (NULL);
7153 }
7154 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7155 if (stream == NULL) {
7156 xmlFreeParserInputBuffer(input);
7157 return (NULL);
7158 }
7159 inputPush(ctxt, stream);
7160 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7161}
7162
7163#define bottom_HTMLparser
7164#include "elfgcchack.h"
7165#endif /* LIBXML_HTML_ENABLED */
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette