VirtualBox

source: vbox/trunk/src/libs/libxml2-2.13.2/include/libxml/HTMLparser.h

Last change on this file was 105420, checked in by vboxsync, 7 months ago

libxml2-2.12.6: Applied and adjusted our libxml2 changes to 2.12.6. bugref:10730

  • Property svn:eol-style set to native
File size: 9.6 KB
Line 
1/*
2 * Summary: interface for an HTML 4.0 non-verifying parser
3 * Description: this module implements an HTML 4.0 non-verifying parser
4 * with API compatible with the XML parser ones. It should
5 * be able to parse "real world" HTML, even if severely
6 * broken from a specification point of view.
7 *
8 * Copy: See Copyright for the status of this software.
9 *
10 * Author: Daniel Veillard
11 */
12
13#ifndef __HTML_PARSER_H__
14#define __HTML_PARSER_H__
15#include <libxml/xmlversion.h>
16#include <libxml/parser.h>
17
18#ifdef LIBXML_HTML_ENABLED
19
20#ifdef __cplusplus
21extern "C" {
22#endif
23
24/*
25 * Most of the back-end structures from XML and HTML are shared.
26 */
27typedef xmlParserCtxt htmlParserCtxt;
28typedef xmlParserCtxtPtr htmlParserCtxtPtr;
29typedef xmlParserNodeInfo htmlParserNodeInfo;
30typedef xmlSAXHandler htmlSAXHandler;
31typedef xmlSAXHandlerPtr htmlSAXHandlerPtr;
32typedef xmlParserInput htmlParserInput;
33typedef xmlParserInputPtr htmlParserInputPtr;
34typedef xmlDocPtr htmlDocPtr;
35typedef xmlNodePtr htmlNodePtr;
36
37/*
38 * Internal description of an HTML element, representing HTML 4.01
39 * and XHTML 1.0 (which share the same structure).
40 */
41typedef struct _htmlElemDesc htmlElemDesc;
42typedef htmlElemDesc *htmlElemDescPtr;
43struct _htmlElemDesc {
44 const char *name; /* The tag name */
45 char startTag; /* Whether the start tag can be implied */
46 char endTag; /* Whether the end tag can be implied */
47 char saveEndTag; /* Whether the end tag should be saved */
48 char empty; /* Is this an empty element ? */
49 char depr; /* Is this a deprecated element ? */
50 char dtd; /* 1: only in Loose DTD, 2: only Frameset one */
51 char isinline; /* is this a block 0 or inline 1 element */
52 const char *desc; /* the description */
53
54/* NRK Jan.2003
55 * New fields encapsulating HTML structure
56 *
57 * Bugs:
58 * This is a very limited representation. It fails to tell us when
59 * an element *requires* subelements (we only have whether they're
60 * allowed or not), and it doesn't tell us where CDATA and PCDATA
61 * are allowed. Some element relationships are not fully represented:
62 * these are flagged with the word MODIFIER
63 */
64 const char** subelts; /* allowed sub-elements of this element */
65 const char* defaultsubelt; /* subelement for suggested auto-repair
66 if necessary or NULL */
67 const char** attrs_opt; /* Optional Attributes */
68 const char** attrs_depr; /* Additional deprecated attributes */
69 const char** attrs_req; /* Required attributes */
70};
71
72/*
73 * Internal description of an HTML entity.
74 */
75typedef struct _htmlEntityDesc htmlEntityDesc;
76typedef htmlEntityDesc *htmlEntityDescPtr;
77struct _htmlEntityDesc {
78 unsigned int value; /* the UNICODE value for the character */
79 const char *name; /* The entity name */
80 const char *desc; /* the description */
81};
82
83#ifdef LIBXML_SAX1_ENABLED
84
85XML_DEPRECATED
86XMLPUBVAR const xmlSAXHandlerV1 htmlDefaultSAXHandler;
87
88#ifdef LIBXML_THREAD_ENABLED
89XML_DEPRECATED
90XMLPUBFUN const xmlSAXHandlerV1 *__htmlDefaultSAXHandler(void);
91#endif
92
93#endif /* LIBXML_SAX1_ENABLED */
94
95/*
96 * There are only a few public functions.
97 */
98XML_DEPRECATED
99XMLPUBFUN void
100 htmlInitAutoClose (void);
101XMLPUBFUN const htmlElemDesc *
102 htmlTagLookup (const xmlChar *tag);
103XMLPUBFUN const htmlEntityDesc *
104 htmlEntityLookup(const xmlChar *name);
105XMLPUBFUN const htmlEntityDesc *
106 htmlEntityValueLookup(unsigned int value);
107
108XMLPUBFUN int
109 htmlIsAutoClosed(htmlDocPtr doc,
110 htmlNodePtr elem);
111XMLPUBFUN int
112 htmlAutoCloseTag(htmlDocPtr doc,
113 const xmlChar *name,
114 htmlNodePtr elem);
115XML_DEPRECATED
116XMLPUBFUN const htmlEntityDesc *
117 htmlParseEntityRef(htmlParserCtxtPtr ctxt,
118 const xmlChar **str);
119XML_DEPRECATED
120XMLPUBFUN int
121 htmlParseCharRef(htmlParserCtxtPtr ctxt);
122XML_DEPRECATED
123XMLPUBFUN void
124 htmlParseElement(htmlParserCtxtPtr ctxt);
125
126XMLPUBFUN htmlParserCtxtPtr
127 htmlNewParserCtxt(void);
128XMLPUBFUN htmlParserCtxtPtr
129 htmlNewSAXParserCtxt(const htmlSAXHandler *sax,
130 void *userData);
131
132XMLPUBFUN htmlParserCtxtPtr
133 htmlCreateMemoryParserCtxt(const char *buffer,
134 int size);
135
136XMLPUBFUN int
137 htmlParseDocument(htmlParserCtxtPtr ctxt);
138XML_DEPRECATED
139XMLPUBFUN htmlDocPtr
140 htmlSAXParseDoc (const xmlChar *cur,
141 const char *encoding,
142 htmlSAXHandlerPtr sax,
143 void *userData);
144XMLPUBFUN htmlDocPtr
145 htmlParseDoc (const xmlChar *cur,
146 const char *encoding);
147XMLPUBFUN htmlParserCtxtPtr
148 htmlCreateFileParserCtxt(const char *filename,
149 const char *encoding);
150XML_DEPRECATED
151XMLPUBFUN htmlDocPtr
152 htmlSAXParseFile(const char *filename,
153 const char *encoding,
154 htmlSAXHandlerPtr sax,
155 void *userData);
156XMLPUBFUN htmlDocPtr
157 htmlParseFile (const char *filename,
158 const char *encoding);
159XMLPUBFUN int
160 UTF8ToHtml (unsigned char *out,
161 int *outlen,
162 const unsigned char *in,
163 int *inlen);
164XMLPUBFUN int
165 htmlEncodeEntities(unsigned char *out,
166 int *outlen,
167 const unsigned char *in,
168 int *inlen, int quoteChar);
169XMLPUBFUN int
170 htmlIsScriptAttribute(const xmlChar *name);
171XML_DEPRECATED
172XMLPUBFUN int
173 htmlHandleOmittedElem(int val);
174
175#ifdef LIBXML_PUSH_ENABLED
176/**
177 * Interfaces for the Push mode.
178 */
179XMLPUBFUN htmlParserCtxtPtr
180 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,
181 void *user_data,
182 const char *chunk,
183 int size,
184 const char *filename,
185 xmlCharEncoding enc);
186XMLPUBFUN int
187 htmlParseChunk (htmlParserCtxtPtr ctxt,
188 const char *chunk,
189 int size,
190 int terminate);
191#endif /* LIBXML_PUSH_ENABLED */
192
193XMLPUBFUN void
194 htmlFreeParserCtxt (htmlParserCtxtPtr ctxt);
195
196/*
197 * New set of simpler/more flexible APIs
198 */
199/**
200 * htmlParserOption:
201 *
202 * This is the set of HTML parser options that can be passed down
203 * to the htmlReadDoc() and similar calls.
204 */
205typedef enum {
206 HTML_PARSE_RECOVER = 1<<0, /* Relaxed parsing */
207 HTML_PARSE_NODEFDTD = 1<<2, /* do not default a doctype if not found */
208 HTML_PARSE_NOERROR = 1<<5, /* suppress error reports */
209 HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */
210 HTML_PARSE_PEDANTIC = 1<<7, /* pedantic error reporting */
211 HTML_PARSE_NOBLANKS = 1<<8, /* remove blank nodes */
212 HTML_PARSE_NONET = 1<<11,/* Forbid network access */
213 HTML_PARSE_NOIMPLIED= 1<<13,/* Do not add implied html/body... elements */
214 HTML_PARSE_COMPACT = 1<<16,/* compact small text nodes */
215 HTML_PARSE_IGNORE_ENC=1<<21 /* ignore internal document encoding hint */
216} htmlParserOption;
217
218XMLPUBFUN void
219 htmlCtxtReset (htmlParserCtxtPtr ctxt);
220XMLPUBFUN int
221 htmlCtxtUseOptions (htmlParserCtxtPtr ctxt,
222 int options);
223XMLPUBFUN htmlDocPtr
224 htmlReadDoc (const xmlChar *cur,
225 const char *URL,
226 const char *encoding,
227 int options);
228XMLPUBFUN htmlDocPtr
229 htmlReadFile (const char *URL,
230 const char *encoding,
231 int options);
232XMLPUBFUN htmlDocPtr
233 htmlReadMemory (const char *buffer,
234 int size,
235 const char *URL,
236 const char *encoding,
237 int options);
238XMLPUBFUN htmlDocPtr
239 htmlReadFd (int fd,
240 const char *URL,
241 const char *encoding,
242 int options);
243XMLPUBFUN htmlDocPtr
244 htmlReadIO (xmlInputReadCallback ioread,
245 xmlInputCloseCallback ioclose,
246 void *ioctx,
247 const char *URL,
248 const char *encoding,
249 int options);
250XMLPUBFUN htmlDocPtr
251 htmlCtxtParseDocument (htmlParserCtxtPtr ctxt,
252 xmlParserInputPtr input);
253XMLPUBFUN htmlDocPtr
254 htmlCtxtReadDoc (xmlParserCtxtPtr ctxt,
255 const xmlChar *cur,
256 const char *URL,
257 const char *encoding,
258 int options);
259XMLPUBFUN htmlDocPtr
260 htmlCtxtReadFile (xmlParserCtxtPtr ctxt,
261 const char *filename,
262 const char *encoding,
263 int options);
264XMLPUBFUN htmlDocPtr
265 htmlCtxtReadMemory (xmlParserCtxtPtr ctxt,
266 const char *buffer,
267 int size,
268 const char *URL,
269 const char *encoding,
270 int options);
271XMLPUBFUN htmlDocPtr
272 htmlCtxtReadFd (xmlParserCtxtPtr ctxt,
273 int fd,
274 const char *URL,
275 const char *encoding,
276 int options);
277XMLPUBFUN htmlDocPtr
278 htmlCtxtReadIO (xmlParserCtxtPtr ctxt,
279 xmlInputReadCallback ioread,
280 xmlInputCloseCallback ioclose,
281 void *ioctx,
282 const char *URL,
283 const char *encoding,
284 int options);
285
286/* NRK/Jan2003: further knowledge of HTML structure
287 */
288typedef enum {
289 HTML_NA = 0 , /* something we don't check at all */
290 HTML_INVALID = 0x1 , /* bit 0; NOTE(review): presumably "not allowed here" - confirm in HTMLparser.c */
291 HTML_DEPRECATED = 0x2 , /* bit 1; NOTE(review): presumably "allowed but deprecated" - confirm */
292 HTML_VALID = 0x4 , /* bit 2; tested via ( status & HTML_VALID ) per the note below */
293 HTML_REQUIRED = 0xc /* VALID bit set so ( & HTML_VALID ) is TRUE; 0xc == 0x8 | HTML_VALID */
294} htmlStatus ;
295
296/* Using htmlElemDesc rather than name here, to emphasise the fact
297 that otherwise there's a lookup overhead
298*/
299XMLPUBFUN htmlStatus htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ;
300XMLPUBFUN int htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ;
301XMLPUBFUN htmlStatus htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ;
302XMLPUBFUN htmlStatus htmlNodeStatus(htmlNodePtr, int) ;
/**
 * htmlDefaultSubelement:
 * @elt: HTML element
 *
 * Returns the default subelement for this element
 *
 * The argument is parenthesized so that any pointer-valued expression
 * (for example "descs + 1" or "&desc") can be passed, not only a plain
 * identifier; without the parentheses "->" would bind to the last
 * operand of the argument expression instead of to the whole argument.
 */
#define htmlDefaultSubelement(elt) (elt)->defaultsubelt
310/**
311 * htmlElementAllowedHereDesc:
312 * @parent: HTML parent element
313 * @elt: HTML element
314 *
315 * Checks whether an HTML element description may be a
316 * direct child of the specified element.
317 *
318 * Returns 1 if allowed; 0 otherwise.
319 */
320#define htmlElementAllowedHereDesc(parent,elt) \
321 htmlElementAllowedHere((parent), (elt)->name)
322/**
323 * htmlRequiredAttrs:
324 * @elt: HTML element
325 *
326 * Returns the attributes required for the specified element.
327 */
328#define htmlRequiredAttrs(elt) (elt)->attrs_req
329
330
331#ifdef __cplusplus
332}
333#endif
334
335#endif /* LIBXML_HTML_ENABLED */
336#endif /* __HTML_PARSER_H__ */
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette