1 | * Summary: interface for an HTML 4.0 non-verifying parser
|
---|
2 | * Description: this module implements an HTML 4.0 non-verifying parser
|
---|
3 | * with an API compatible with the XML parser's. It should
|
---|
4 | * be able to parse "real world" HTML, even if severely
|
---|
5 | * broken from a specification point of view.
|
---|
6 | *
|
---|
7 | * Copy: See Copyright for the status of this software.
|
---|
8 | *
|
---|
9 | * Author: Patrick Monnerat <[email protected]>, DATASPHERE S.A.
|
---|
10 |
|
---|
11 | /if not defined(HTML_PARSER_H__)
|
---|
12 | /define HTML_PARSER_H__
|
---|
13 |
|
---|
14 | /include "libxmlrpg/xmlversion"
|
---|
15 |
|
---|
16 | /if defined(LIBXML_HTML_ENABLED)
|
---|
17 |
|
---|
18 | /include "libxmlrpg/xmlTypesC"
|
---|
19 | /include "libxmlrpg/parser"
|
---|
20 |
|
---|
21 | * Most of the back-end structures from XML and HTML are shared.
|
---|
22 |
|
---|
23 | d htmlParserCtxtPtr...
|
---|
24 | d s based(######typedef######)
|
---|
25 | d like(xmlParserCtxtPtr)
|
---|
26 |
|
---|
27 | d htmlParserCtxt ds based(htmlParserCtxtPtr)
|
---|
28 | d likeds(xmlParserCtxt)
|
---|
29 |
|
---|
30 | d htmlParserNodeInfoPtr...
|
---|
31 | d s based(######typedef######)
|
---|
32 | d like(xmlParserNodeInfoPtr)
|
---|
33 |
|
---|
34 | d htmlParserNodeInfo...
|
---|
35 | d ds based(htmlParserNodeInfoPtr)
|
---|
36 | d likeds(xmlParserNodeInfo)
|
---|
37 |
|
---|
38 | d htmlSAXHandlerPtr...
|
---|
39 | d s based(######typedef######)
|
---|
40 | d like(xmlSAXHandlerPtr)
|
---|
41 |
|
---|
42 | d htmlSAXHandler ds based(htmlSAXHandlerPtr)
|
---|
43 | d likeds(xmlSAXHandler)
|
---|
44 |
|
---|
45 | d htmlParserInputPtr...
|
---|
46 | d s based(######typedef######)
|
---|
47 | d like(xmlParserInputPtr)
|
---|
48 |
|
---|
49 | d htmlParserInput...
|
---|
50 | d ds based(htmlParserInputPtr)
|
---|
51 | d likeds(xmlParserInput)
|
---|
52 |
|
---|
53 | d htmlDocPtr s based(######typedef######)
|
---|
54 | d like(xmlDocPtr)
|
---|
55 |
|
---|
56 | d htmlNodePtr s based(######typedef######)
|
---|
57 | d like(xmlNodePtr)
|
---|
58 |
|
---|
59 | * Internal description of an HTML element, representing HTML 4.01
|
---|
60 | * and XHTML 1.0 (which share the same structure).
|
---|
61 |
|
---|
62 | d htmlElemDescPtr...
|
---|
63 | d s * based(######typedef######)
|
---|
64 |
|
---|
65 | d htmlElemDesc ds based(htmlElemDescPtr)
|
---|
66 | d align qualified
|
---|
67 | d name * const char *
|
---|
68 | d startTag like(xmlCchar) Start tag implied ?
|
---|
69 | d endTag like(xmlCchar) End tag implied ?
|
---|
70 | d saveEndTag like(xmlCchar) Save end tag ?
|
---|
71 | d empty like(xmlCchar) Empty element ?
|
---|
72 | d depr like(xmlCchar) Deprecated element ?
|
---|
73 | d dtd like(xmlCchar) Loose DTD/Frameset
|
---|
74 | d isinline like(xmlCchar) Block 0/inline elem?
|
---|
75 | d desc * const char *
|
---|
76 | *
|
---|
77 | * New fields encapsulating HTML structure
|
---|
78 | *
|
---|
79 | * Bugs:
|
---|
80 | * This is a very limited representation. It fails to tell us when
|
---|
81 | * an element *requires* subelements (we only have whether they're
|
---|
82 | * allowed or not), and it doesn't tell us where CDATA and PCDATA
|
---|
83 | * are allowed. Some element relationships are not fully represented:
|
---|
84 | * these are flagged with the word MODIFIER
|
---|
85 | *
|
---|
86 | d subelts * const char * *
|
---|
87 | d defaultsubelt * const char *
|
---|
88 | d attrs_opt * const char * *
|
---|
89 | d attrs_depr * const char * *
|
---|
90 | d attrs_req * const char * *
|
---|
91 |
|
---|
92 | * Internal description of an HTML entity.
|
---|
93 |
|
---|
94 | d htmlEntityDescPtr...
|
---|
95 | d s * based(######typedef######)
|
---|
96 |
|
---|
97 | d htmlEntityDesc...
|
---|
98 | d ds based(htmlEntityDescPtr)
|
---|
99 | d align qualified
|
---|
100 | d value like(xmlCuint)
|
---|
101 | d name * const char *
|
---|
102 | d desc * const char *
|
---|
103 |
|
---|
104 | * There are only a few public functions.
|
---|
105 |
|
---|
106 | d htmlTagLookup pr extproc('htmlTagLookup')
|
---|
107 | d like(htmlElemDescPtr) const
|
---|
108 | d tag * value options(*string) const xmlChar *
|
---|
109 |
|
---|
110 | d htmlEntityLookup...
|
---|
111 | d pr extproc('htmlEntityLookup')
|
---|
112 | d like(htmlEntityDescPtr) const
|
---|
113 | d name * value options(*string) const xmlChar *
|
---|
114 |
|
---|
115 | d htmlEntityValueLookup...
|
---|
116 | d pr extproc('htmlEntityValueLookup')
|
---|
117 | d like(htmlEntityDescPtr) const
|
---|
118 | d value value like(xmlCuint)
|
---|
119 |
|
---|
120 | d htmlIsAutoClosed...
|
---|
121 | d pr extproc('htmlIsAutoClosed')
|
---|
122 | d like(xmlCint)
|
---|
123 | d doc value like(htmlDocPtr)
|
---|
124 | d elem value like(htmlNodePtr)
|
---|
125 |
|
---|
126 | d htmlAutoCloseTag...
|
---|
127 | d pr extproc('htmlAutoCloseTag')
|
---|
128 | d like(xmlCint)
|
---|
129 | d doc value like(htmlDocPtr)
|
---|
130 | d name * value options(*string) const xmlChar *
|
---|
131 | d elem value like(htmlNodePtr)
|
---|
132 |
|
---|
133 | d htmlParseEntityRef...
|
---|
134 | d pr extproc('htmlParseEntityRef')
|
---|
135 | d like(htmlEntityDescPtr) const
|
---|
136 | d ctxt value like(htmlParserCtxtPtr)
|
---|
137 | d str * const xmlChar *(*)
|
---|
138 |
|
---|
139 | d htmlParseCharRef...
|
---|
140 | d pr extproc('htmlParseCharRef')
|
---|
141 | d like(xmlCint)
|
---|
142 | d ctxt value like(htmlParserCtxtPtr)
|
---|
143 |
|
---|
144 | d htmlParseElement...
|
---|
145 | d pr extproc('htmlParseElement') void
|
---|
146 | d ctxt value like(htmlParserCtxtPtr)
|
---|
147 |
|
---|
148 | d htmlNewParserCtxt...
|
---|
149 | d pr extproc('htmlNewParserCtxt')
|
---|
150 | d like(htmlParserCtxtPtr)
|
---|
151 |
|
---|
152 | d htmlCreateMemoryParserCtxt...
|
---|
153 | d pr extproc('htmlCreateMemoryParserCtxt')
|
---|
154 | d like(htmlParserCtxtPtr)
|
---|
155 | d buffer * value options(*string) const char *
|
---|
156 | d size value like(xmlCint)
|
---|
157 |
|
---|
158 | d htmlParseDocument...
|
---|
159 | d pr extproc('htmlParseDocument')
|
---|
160 | d like(xmlCint)
|
---|
161 | d ctxt value like(htmlParserCtxtPtr)
|
---|
162 |
|
---|
163 | d htmlSAXParseDoc...
|
---|
164 | d pr extproc('htmlSAXParseDoc')
|
---|
165 | d like(htmlDocPtr)
|
---|
166 | d cur * value options(*string) xmlChar *
|
---|
167 | d encoding * value options(*string) const char *
|
---|
168 | d sax value like(htmlSAXHandlerPtr)
|
---|
169 | d userData * value void *
|
---|
170 |
|
---|
171 | d htmlParseDoc pr extproc('htmlParseDoc')
|
---|
172 | d like(htmlDocPtr)
|
---|
173 | d cur * value options(*string) xmlChar *
|
---|
174 | d encoding * value options(*string) const char *
|
---|
175 |
|
---|
176 | d htmlSAXParseFile...
|
---|
177 | d pr extproc('htmlSAXParseFile')
|
---|
178 | d like(htmlDocPtr)
|
---|
179 | d filename * value options(*string) const char *
|
---|
180 | d encoding * value options(*string) const char *
|
---|
181 | d sax value like(htmlSAXHandlerPtr)
|
---|
182 | d userData * value void *
|
---|
183 |
|
---|
184 | d htmlParseFile pr extproc('htmlParseFile')
|
---|
185 | d like(htmlDocPtr)
|
---|
186 | d filename * value options(*string) const char *
|
---|
187 | d encoding * value options(*string) const char *
|
---|
188 |
|
---|
189 | d UTF8ToHtml pr extproc('UTF8ToHtml')
|
---|
190 | d like(xmlCint)
|
---|
191 | d out 65535 options(*varsize) unsigned char []
|
---|
192 | d outlen like(xmlCint) Passed by reference
|
---|
193 | d in * value options(*string) const unsigned char*
|
---|
194 | d inlen like(xmlCint) Passed by reference
|
---|
195 |
|
---|
196 | d htmlEncodeEntities...
|
---|
197 | d pr extproc('htmlEncodeEntities')
|
---|
198 | d like(xmlCint)
|
---|
199 | d out 65535 options(*varsize) unsigned char []
|
---|
200 | d outlen like(xmlCint) Passed by reference
|
---|
201 | d in * value options(*string) const unsigned char*
|
---|
202 | d inlen like(xmlCint) Passed by reference
|
---|
203 | d quoteChar value like(xmlCint)
|
---|
204 |
|
---|
205 | d htmlIsScriptAttribute...
|
---|
206 | d pr extproc('htmlIsScriptAttribute')
|
---|
207 | d like(xmlCint)
|
---|
208 | d name * value options(*string) const xmlChar *
|
---|
209 |
|
---|
210 | d htmlHandleOmittedElem...
|
---|
211 | d pr extproc('htmlHandleOmittedElem')
|
---|
212 | d like(xmlCint)
|
---|
213 | d val value like(xmlCint)
|
---|
214 |
|
---|
215 | /if defined(LIBXML_PUSH_ENABLED)
|
---|
216 |
|
---|
217 | * Interfaces for the Push mode.
|
---|
218 |
|
---|
219 | d htmlCreatePushParserCtxt...
|
---|
220 | d pr extproc('htmlCreatePushParserCtxt')
|
---|
221 | d like(htmlParserCtxtPtr)
|
---|
222 | d sax value like(htmlSAXHandlerPtr)
|
---|
223 | d user_data * value void *
|
---|
224 | d chunk * value options(*string) const char *
|
---|
225 | d size value like(xmlCint)
|
---|
226 | d filename * value options(*string) const char *
|
---|
227 | d enc value like(xmlCharEncoding)
|
---|
228 |
|
---|
229 | d htmlParseChunk pr extproc('htmlParseChunk')
|
---|
230 | d like(xmlCint)
|
---|
231 | d ctxt value like(htmlParserCtxtPtr)
|
---|
232 | d chunk * value options(*string) const char *
|
---|
233 | d size value like(xmlCint)
|
---|
234 | d terminate value like(xmlCint)
|
---|
235 | /endif LIBXML_PUSH_ENABLED
|
---|
236 |
|
---|
237 | d htmlFreeParserCtxt...
|
---|
238 | d pr extproc('htmlFreeParserCtxt') void
|
---|
239 | d ctxt value like(htmlParserCtxtPtr)
|
---|
240 |
|
---|
241 | * New set of simpler/more flexible APIs
|
---|
242 |
|
---|
243 | * htmlParserOption:
|
---|
244 | *
|
---|
245 | * This is the set of HTML parser options that can be passed down
|
---|
246 | * to the htmlReadDoc() and similar calls.
|
---|
247 |
|
---|
248 | d htmlParserOption...
|
---|
249 | d s based(######typedef######)
|
---|
250 | d like(xmlCenum)
|
---|
251 | d HTML_PARSE_RECOVER... Relaxed parsing
|
---|
252 | d c X'00000001'
|
---|
253 | d HTML_PARSE_NODEFDTD... No default doctype
|
---|
254 | d c X'00000004'
|
---|
255 | d HTML_PARSE_NOERROR... No error reports
|
---|
256 | d c X'00000020'
|
---|
257 | d HTML_PARSE_NOWARNING... No warning reports
|
---|
258 | d c X'00000040'
|
---|
259 | d HTML_PARSE_PEDANTIC... Pedantic err reports
|
---|
260 | d c X'00000080'
|
---|
261 | d HTML_PARSE_NOBLANKS... Remove blank nodes
|
---|
262 | d c X'00000100'
|
---|
263 | d HTML_PARSE_NONET... Forbid net access
|
---|
264 | d c X'00000800'
|
---|
265 | d HTML_PARSE_NOIMPLIED... No implied html/body
|
---|
266 | d c X'00002000'
|
---|
267 | d HTML_PARSE_COMPACT... compact small txtnod
|
---|
268 | d c X'00010000'
|
---|
269 | d HTML_PARSE_IGNORE_ENC... Ignore encoding hint
|
---|
270 | d c X'00200000'
|
---|
271 |
|
---|
272 | d htmlCtxtReset pr extproc('htmlCtxtReset') void
|
---|
273 | d ctxt value like(htmlParserCtxtPtr)
|
---|
274 |
|
---|
275 | d htmlCtxtUseOptions...
|
---|
276 | d pr extproc('htmlCtxtUseOptions')
|
---|
277 | d like(xmlCint)
|
---|
278 | d ctxt value like(htmlParserCtxtPtr)
|
---|
279 | d options value like(xmlCint) HTML_PARSE_* flags
|
---|
280 |
|
---|
281 | d htmlReadDoc pr extproc('htmlReadDoc')
|
---|
282 | d like(htmlDocPtr)
|
---|
283 | d cur * value options(*string) const xmlChar *
|
---|
284 | d URL * value options(*string) const char *
|
---|
285 | d encoding * value options(*string) const char *
|
---|
286 | d options value like(xmlCint) HTML_PARSE_* flags
|
---|
287 |
|
---|
288 | d htmlReadFile pr extproc('htmlReadFile')
|
---|
289 | d like(htmlDocPtr)
|
---|
290 | d URL * value options(*string) const char *
|
---|
291 | d encoding * value options(*string) const char *
|
---|
292 | d options value like(xmlCint) HTML_PARSE_* flags
|
---|
293 |
|
---|
294 | d htmlReadMemory pr extproc('htmlReadMemory')
|
---|
295 | d like(htmlDocPtr)
|
---|
296 | d buffer * value options(*string) const char *
|
---|
297 | d size value like(xmlCint)
|
---|
298 | d URL * value options(*string) const char *
|
---|
299 | d encoding * value options(*string) const char *
|
---|
300 | d options value like(xmlCint) HTML_PARSE_* flags
|
---|
301 |
|
---|
302 | d htmlReadFd pr extproc('htmlReadFd')
|
---|
303 | d like(htmlDocPtr)
|
---|
304 | d fd value like(xmlCint)
|
---|
305 | d URL * value options(*string) const char *
|
---|
306 | d encoding * value options(*string) const char *
|
---|
307 | d options value like(xmlCint) HTML_PARSE_* flags
|
---|
308 |
|
---|
309 | d htmlReadIO pr extproc('htmlReadIO')
|
---|
310 | d like(htmlDocPtr)
|
---|
311 | d ioread value like(xmlInputReadCallback)
|
---|
312 | d ioclose value like(xmlInputCloseCallback)
|
---|
313 | d ioctx * value void *
|
---|
314 | d URL * value options(*string) const char *
|
---|
315 | d encoding * value options(*string) const char *
|
---|
316 | d options value like(xmlCint) HTML_PARSE_* flags
|
---|
317 |
|
---|
318 | d htmlCtxtReadDoc...
|
---|
319 | d pr extproc('htmlCtxtReadDoc')
|
---|
320 | d like(htmlDocPtr)
|
---|
321 | d ctxt value like(xmlParserCtxtPtr)
|
---|
322 | d cur * value options(*string) const xmlChar *
|
---|
323 | d URL * value options(*string) const char *
|
---|
324 | d encoding * value options(*string) const char *
|
---|
325 | d options value like(xmlCint) HTML_PARSE_* flags
|
---|
326 |
|
---|
327 | d htmlCtxtReadFile...
|
---|
328 | d pr extproc('htmlCtxtReadFile')
|
---|
329 | d like(htmlDocPtr)
|
---|
330 | d ctxt value like(xmlParserCtxtPtr)
|
---|
331 | d filename * value options(*string) const char *
|
---|
332 | d encoding * value options(*string) const char *
|
---|
333 | d options value like(xmlCint) HTML_PARSE_* flags
|
---|
334 |
|
---|
335 | d htmlCtxtReadMemory...
|
---|
336 | d pr extproc('htmlCtxtReadMemory')
|
---|
337 | d like(htmlDocPtr)
|
---|
338 | d ctxt value like(xmlParserCtxtPtr)
|
---|
339 | d buffer * value options(*string) const char *
|
---|
340 | d size value like(xmlCint)
|
---|
341 | d URL * value options(*string) const char *
|
---|
342 | d encoding * value options(*string) const char *
|
---|
343 | d options value like(xmlCint) HTML_PARSE_* flags
|
---|
344 |
|
---|
345 | d htmlCtxtReadFd pr extproc('htmlCtxtReadFd')
|
---|
346 | d like(htmlDocPtr)
|
---|
347 | d ctxt value like(xmlParserCtxtPtr)
|
---|
348 | d fd value like(xmlCint)
|
---|
349 | d URL * value options(*string) const char *
|
---|
350 | d encoding * value options(*string) const char *
|
---|
351 | d options value like(xmlCint) HTML_PARSE_* flags
|
---|
352 |
|
---|
353 | d htmlCtxtReadIO pr extproc('htmlCtxtReadIO')
|
---|
354 | d like(htmlDocPtr)
|
---|
355 | d ctxt value like(xmlParserCtxtPtr)
|
---|
356 | d ioread value like(xmlInputReadCallback)
|
---|
357 | d ioclose value like(xmlInputCloseCallback)
|
---|
358 | d ioctx * value void *
|
---|
359 | d URL * value options(*string) const char *
|
---|
360 | d encoding * value options(*string) const char *
|
---|
361 | d options value like(xmlCint) HTML_PARSE_* flags
|
---|
362 |
|
---|
363 | * Further knowledge of HTML structure
|
---|
364 |
|
---|
365 | d htmlStatus s based(######typedef######)
|
---|
366 | d like(xmlCenum)
|
---|
367 | d HTML_NA c X'0000' No check at all
|
---|
368 | d HTML_INVALID c X'0001'
|
---|
369 | d HTML_DEPRECATED...
|
---|
370 | d c X'0002'
|
---|
371 | d HTML_VALID c X'0004'
|
---|
372 | d HTML_REQUIRED c X'000C' HTML_VALID ored-in
|
---|
373 |
|
---|
374 | * Using htmlElemDesc rather than name here, to emphasise the fact
|
---|
375 | * that otherwise there's a lookup overhead
|
---|
376 |
|
---|
377 | d htmlAttrAllowed...
|
---|
378 | d pr extproc('htmlAttrAllowed')
|
---|
379 | d like(htmlStatus)
|
---|
380 | d #param1 value like(htmlElemDescPtr) const
|
---|
381 | d #param2 * value options(*string) const xmlChar *
|
---|
382 | d #param3 value like(xmlCint)
|
---|
383 |
|
---|
384 | d htmlElementAllowedHere...
|
---|
385 | d pr extproc('htmlElementAllowedHere')
|
---|
386 | d like(xmlCint)
|
---|
387 | d #param1 value like(htmlElemDescPtr) const
|
---|
388 | d #param2 * value options(*string) const xmlChar *
|
---|
389 |
|
---|
390 | d htmlElementStatusHere...
|
---|
391 | d pr extproc('htmlElementStatusHere')
|
---|
392 | d like(htmlStatus)
|
---|
393 | d #param1 value like(htmlElemDescPtr) const
|
---|
394 | d #param2 value like(htmlElemDescPtr) const
|
---|
395 |
|
---|
396 | d htmlNodeStatus pr extproc('htmlNodeStatus')
|
---|
397 | d like(htmlStatus)
|
---|
398 | d #param1 value like(htmlNodePtr)
|
---|
399 | d #param2 value like(xmlCint)
|
---|
400 |
|
---|
401 | * C macros implemented as procedures for ILE/RPG support.
|
---|
402 |
|
---|
403 | d htmlDefaultSubelement...
|
---|
404 | d pr * extproc('__htmlDefaultSubelement') const char *
|
---|
405 | d elt * value const htmlElemDesc *
|
---|
406 |
|
---|
407 | d htmlElementAllowedHereDesc...
|
---|
408 | d pr extproc(
|
---|
409 | d '__htmlElementAllowedHereDesc')
|
---|
410 | d like(xmlCint)
|
---|
411 | d parent * value const htmlElemDesc *
|
---|
412 | d elt * value const htmlElemDesc *
|
---|
413 |
|
---|
414 | d htmlRequiredAttrs...
|
---|
415 | d pr * extproc('__htmlRequiredAttrs') const char * *
|
---|
416 | d elt * value const htmlElemDesc *
|
---|
417 |
|
---|
418 | /endif LIBXML_HTML_ENABLED
|
---|
419 | /endif HTML_PARSER_H__
|
---|