1 | <?xml version="1.0"?>
|
---|
2 | <!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.2//EN"
|
---|
3 | "http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd" [
|
---|
4 | <!ENTITY KEYWORD SYSTEM "includekeyword.c">
|
---|
5 | <!ENTITY XPATH SYSTEM "includexpath.c">
|
---|
6 | <!ENTITY STORY SYSTEM "includestory.xml">
|
---|
7 | <!ENTITY ADDKEYWORD SYSTEM "includeaddkeyword.c">
|
---|
8 | <!ENTITY ADDATTRIBUTE SYSTEM "includeaddattribute.c">
|
---|
9 | <!ENTITY GETATTRIBUTE SYSTEM "includegetattribute.c">
|
---|
10 | <!ENTITY CONVERT SYSTEM "includeconvert.c">
|
---|
11 | ]>
|
---|
12 | <article lang="en">
|
---|
13 | <articleinfo>
|
---|
14 | <title>Libxml Tutorial</title>
|
---|
15 | <author>
|
---|
16 | <firstname>John</firstname>
|
---|
17 | <surname>Fleck</surname>
|
---|
18 | <email>[email protected]</email>
|
---|
19 | </author>
|
---|
20 | <copyright>
|
---|
21 | <year>2002, 2003</year>
|
---|
22 | <holder>John Fleck</holder>
|
---|
23 | </copyright>
|
---|
24 | <revhistory>
|
---|
25 | <revision>
|
---|
26 | <revnumber>1</revnumber>
|
---|
27 | <date>June 4, 2002</date>
|
---|
28 | <revremark>Initial draft</revremark>
|
---|
29 | </revision>
|
---|
30 | <revision>
|
---|
31 | <revnumber>2</revnumber>
|
---|
32 | <date>June 12, 2002</date>
|
---|
33 | <revremark>retrieving attribute value added</revremark>
|
---|
34 | </revision>
|
---|
35 | <revision>
|
---|
36 | <revnumber>3</revnumber>
|
---|
37 | <date>Aug. 31, 2002</date>
|
---|
38 | <revremark>freeing memory fix</revremark>
|
---|
39 | </revision>
|
---|
40 | <revision>
|
---|
41 | <revnumber>4</revnumber>
|
---|
42 | <date>Nov. 10, 2002</date>
|
---|
43 | <revremark>encoding discussion added</revremark>
|
---|
44 | </revision>
|
---|
45 | <revision>
|
---|
46 | <revnumber>5</revnumber>
|
---|
47 | <date>Dec. 15, 2002</date>
|
---|
48 | <revremark>more memory freeing changes</revremark>
|
---|
49 | </revision>
|
---|
50 | <revision>
|
---|
51 | <revnumber>6</revnumber>
|
---|
52 | <date>Jan. 26, 2003</date>
|
---|
53 | <revremark>add index</revremark>
|
---|
54 | </revision>
|
---|
55 | <revision>
|
---|
56 | <revnumber>7</revnumber>
|
---|
57 | <date>April 25, 2003</date>
|
---|
58 | <revremark>add compilation appendix</revremark>
|
---|
59 | </revision>
|
---|
60 | <revision>
|
---|
61 | <revnumber>8</revnumber>
|
---|
62 | <date>July 24, 2003</date>
|
---|
63 | <revremark>add XPath example</revremark>
|
---|
64 | </revision>
|
---|
65 | <revision>
|
---|
66 | <revnumber>9</revnumber>
|
---|
67 | <date>Feb. 14, 2004</date>
|
---|
68 | <revremark>Fix bug in XPath example</revremark>
|
---|
69 | </revision>
|
---|
70 | <revision>
|
---|
71 | <revnumber>10</revnumber>
|
---|
72 | <date>Aug. 24, 2004</date>
|
---|
73 | <revremark>Fix another bug in XPath example</revremark>
|
---|
74 | </revision>
|
---|
75 | </revhistory>
|
---|
76 | </articleinfo>
|
---|
77 | <abstract>
|
---|
78 | <para>Libxml is a freely licensed C language library for handling
|
---|
79 | <acronym>XML</acronym>, portable across a large number of platforms. This
|
---|
80 | tutorial provides examples of its basic functions.</para>
|
---|
81 | </abstract>
|
---|
82 | <sect1 id="introduction">
|
---|
83 | <title>Introduction</title>
|
---|
84 | <para>Libxml is a C language library implementing functions for reading,
|
---|
85 | creating and manipulating <acronym>XML</acronym> data. This tutorial
|
---|
86 | provides example code and explanations of its basic functionality.</para>
|
---|
87 | <para>Libxml and more details about its use are available on <ulink
|
---|
88 | url="https://gitlab.gnome.org/GNOME/libxml2">the project home page</ulink>. Included there is complete <ulink url="https://gnome.pages.gitlab.gnome.org/libxml2/devhelp/general.html">
|
---|
89 | <acronym>API</acronym> documentation</ulink>. This tutorial is not meant
|
---|
90 | to substitute for that complete documentation, but to illustrate the
|
---|
91 | functions needed to use the library to perform basic operations.
|
---|
92 | <!--
|
---|
93 | Links to
|
---|
94 | other resources can be found in <xref linkend="furtherresources" />.
|
---|
95 | -->
|
---|
96 | </para>
|
---|
97 | <para>The tutorial is based on a simple <acronym>XML</acronym> application I
|
---|
98 | use for articles I write. The format includes metadata and the body
|
---|
99 | of the article.</para>
|
---|
100 | <para>The example code in this tutorial demonstrates how to:
|
---|
101 | <itemizedlist>
|
---|
102 | <listitem>
|
---|
103 | <para>Parse the document.</para>
|
---|
104 | </listitem>
|
---|
105 | <listitem>
|
---|
106 | <para>Extract the text within a specified element.</para>
|
---|
107 | </listitem>
|
---|
108 | <listitem>
|
---|
109 | <para>Add an element and its content.</para>
|
---|
110 | </listitem>
|
---|
111 | <listitem>
|
---|
112 | <para>Add an attribute.</para>
|
---|
113 | </listitem>
|
---|
114 | <listitem>
|
---|
115 | <para>Extract the value of an attribute.</para>
|
---|
116 | </listitem>
|
---|
117 | </itemizedlist>
|
---|
118 | </para>
|
---|
119 | <para>Full code for the examples is included in the appendices.</para>
|
---|
120 |
|
---|
121 | </sect1>
|
---|
122 |
|
---|
123 | <sect1 id="xmltutorialdatatypes">
|
---|
124 | <title>Data Types</title>
|
---|
125 | <para><application>Libxml</application> declares a number of data types we
|
---|
126 | will encounter repeatedly, hiding the messy stuff so you do not have to deal
|
---|
127 | with it unless you have some specific need.</para>
|
---|
128 | <para>
|
---|
129 | <variablelist>
|
---|
130 | <varlistentry>
|
---|
131 | <term><indexterm>
|
---|
132 | <primary>xmlChar</primary>
|
---|
133 | </indexterm>
|
---|
134 | <ulink
|
---|
135 | url="https://gnome.pages.gitlab.gnome.org/libxml2/devhelp/libxml2-tree.html#XMLCHAR">xmlChar</ulink></term>
|
---|
136 | <listitem>
|
---|
137 | <para>A basic replacement for char, a byte in a UTF-8 encoded
|
---|
138 | string. If your data uses another encoding, it must be converted to
|
---|
139 | UTF-8 for use with <application>libxml's</application>
|
---|
140 | functions. More information on encoding is available on the <ulink
|
---|
141 | url="https://gitlab.gnome.org/GNOME/libxml2/-/wikis/Encodings-support"><application>libxml</application> encoding support web page</ulink>.</para>
|
---|
142 | </listitem>
|
---|
143 | </varlistentry>
|
---|
144 | <varlistentry>
|
---|
145 | <term><indexterm>
|
---|
146 | <primary>xmlDoc</primary>
|
---|
147 | </indexterm>
|
---|
148 | <ulink url="https://gnome.pages.gitlab.gnome.org/libxml2/devhelp/libxml2-tree.html#XMLDOC">xmlDoc</ulink></term>
|
---|
149 | <listitem>
|
---|
150 | <para>A structure containing the tree created by a parsed doc. <ulink
|
---|
151 | url="https://gnome.pages.gitlab.gnome.org/libxml2/devhelp/libxml2-tree.html#XMLDOCPTR">xmlDocPtr</ulink>
|
---|
152 | is a pointer to the structure.</para>
|
---|
153 | </listitem>
|
---|
154 | </varlistentry>
|
---|
155 | <varlistentry>
|
---|
156 | <term><indexterm>
|
---|
157 | <primary>xmlNodePtr</primary>
|
---|
158 | </indexterm>
|
---|
159 | <ulink
|
---|
160 | url="https://gnome.pages.gitlab.gnome.org/libxml2/devhelp/libxml2-tree.html#XMLNODEPTR">xmlNodePtr</ulink>
|
---|
161 | and <ulink url="https://gnome.pages.gitlab.gnome.org/libxml2/devhelp/libxml2-tree.html#XMLNODE">xmlNode</ulink></term>
|
---|
162 | <listitem>
|
---|
163 | <para>A structure containing a single node. <ulink
|
---|
164 | url="https://gnome.pages.gitlab.gnome.org/libxml2/devhelp/libxml2-tree.html#XMLNODEPTR">xmlNodePtr</ulink>
|
---|
165 | is a pointer to the structure, and is used in traversing the document tree.</para>
|
---|
166 | </listitem>
|
---|
167 | </varlistentry>
|
---|
168 | </variablelist>
|
---|
169 | </para>
|
---|
170 |
|
---|
171 | </sect1>
|
---|
172 |
|
---|
173 | <sect1 id="xmltutorialparsing">
|
---|
174 | <title>Parsing the file</title>
|
---|
175 | <para><indexterm id="fileparsing" class="startofrange">
|
---|
176 | <primary>file</primary>
|
---|
177 | <secondary>parsing</secondary>
|
---|
178 | </indexterm>
|
---|
179 | Parsing the file requires only the name of the file and a single
|
---|
180 | function call, plus error checking. Full code: <xref
|
---|
181 | linkend="keywordappendix" /></para>
|
---|
182 | <para>
|
---|
183 | <programlisting>
|
---|
184 | <co id="declaredoc" /> xmlDocPtr doc;
|
---|
185 | <co id="declarenode" /> xmlNodePtr cur;
|
---|
186 |
|
---|
187 | <co id="parsefile" /> doc = xmlParseFile(docname);
|
---|
188 |
|
---|
189 | <co id="checkparseerror" /> if (doc == NULL ) {
|
---|
190 | fprintf(stderr,"Document not parsed successfully. \n");
|
---|
191 | return;
|
---|
192 | }
|
---|
193 |
|
---|
194 | <co id="getrootelement" /> cur = xmlDocGetRootElement(doc);
|
---|
195 |
|
---|
196 | <co id="checkemptyerror" /> if (cur == NULL) {
|
---|
197 | fprintf(stderr,"empty document\n");
|
---|
198 | xmlFreeDoc(doc);
|
---|
199 | return;
|
---|
200 | }
|
---|
201 |
|
---|
202 | <co id="checkroottype" /> if (xmlStrcmp(cur->name, (const xmlChar *) "story")) {
|
---|
203 | fprintf(stderr,"document of the wrong type, root node != story");
|
---|
204 | xmlFreeDoc(doc);
|
---|
205 | return;
|
---|
206 | }
|
---|
207 |
|
---|
208 | </programlisting>
|
---|
209 | <calloutlist>
|
---|
210 | <callout arearefs="declaredoc">
|
---|
211 | <para>Declare the pointer that will point to your parsed document.</para>
|
---|
212 | </callout>
|
---|
213 | <callout arearefs="declarenode">
|
---|
214 | <para>Declare a node pointer (you'll need this in order to
|
---|
215 | interact with individual nodes).</para>
|
---|
216 | </callout>
|
---|
217 | <callout arearefs="checkparseerror">
|
---|
218 | <para>Check to see that the document was successfully parsed. If it
|
---|
219 | was not, <application>libxml</application> will at this point
|
---|
220 | register an error and stop.
|
---|
221 | <note>
|
---|
222 | <para><indexterm>
|
---|
223 | <primary>encoding</primary>
|
---|
224 | </indexterm>
|
---|
225 | One common example of an error at this point is improper
|
---|
226 | handling of encoding. The <acronym>XML</acronym> standard requires
|
---|
227 | documents stored with an encoding other than UTF-8 or UTF-16 to
|
---|
228 | contain an explicit declaration of their encoding. If the
|
---|
229 | declaration is there, <application>libxml</application> will
|
---|
230 | automatically perform the necessary conversion to UTF-8 for
|
---|
231 | you. More information on <acronym>XML's</acronym> encoding
|
---|
232 | requirements is contained in the <ulink
|
---|
233 | url="http://www.w3.org/TR/REC-xml#charencoding">standard</ulink>.</para>
|
---|
234 | </note>
|
---|
235 | </para>
|
---|
236 | </callout>
|
---|
237 | <callout arearefs="getrootelement">
|
---|
238 | <para>Retrieve the document's root element.</para>
|
---|
239 | </callout>
|
---|
240 | <callout arearefs="checkemptyerror">
|
---|
241 | <para>Check to make sure the document actually contains something.</para>
|
---|
242 | </callout>
|
---|
243 | <callout arearefs="checkroottype">
|
---|
244 | <para>In our case, we need to make sure the document is the right
|
---|
245 | type. "story" is the root type of the documents used in this
|
---|
246 | tutorial.</para>
|
---|
247 | </callout>
|
---|
248 | </calloutlist>
|
---|
249 | <indexterm startref="fileparsing" class="endofrange" />
|
---|
250 | </para>
|
---|
251 | </sect1>
|
---|
252 |
|
---|
253 | <sect1 id="xmltutorialgettext">
|
---|
254 | <title>Retrieving Element Content</title>
|
---|
255 | <para><indexterm>
|
---|
256 | <primary>element</primary>
|
---|
257 | <secondary>retrieving content</secondary>
|
---|
258 | </indexterm>
|
---|
259 | Retrieving the content of an element involves traversing the document
|
---|
260 | tree until you find what you are looking for. In this case, we are looking
|
---|
261 | for an element called "keyword" contained within an element called "story". The
|
---|
262 | process to find the node we are interested in involves tediously walking the
|
---|
263 | tree. We assume you already have an xmlDocPtr called <varname>doc</varname>
|
---|
264 | and an xmlNodePtr called <varname>cur</varname>.</para>
|
---|
265 |
|
---|
266 | <para>
|
---|
267 | <programlisting>
|
---|
268 | <co id="getchildnode" />cur = cur->xmlChildrenNode;
|
---|
269 | <co id="huntstoryinfo" />while (cur != NULL) {
|
---|
270 | if ((!xmlStrcmp(cur->name, (const xmlChar *)"storyinfo"))){
|
---|
271 | parseStory (doc, cur);
|
---|
272 | }
|
---|
273 |
|
---|
274 | cur = cur->next;
|
---|
275 | }
|
---|
276 | </programlisting>
|
---|
277 |
|
---|
278 | <calloutlist>
|
---|
279 | <callout arearefs="getchildnode">
|
---|
280 | <para>Get the first child node of <varname>cur</varname>. At this
|
---|
281 | point, <varname>cur</varname> points at the document root, which is
|
---|
282 | the element "story".</para>
|
---|
283 | </callout>
|
---|
284 | <callout arearefs="huntstoryinfo">
|
---|
285 | <para>This loop iterates through the elements that are children of
|
---|
286 | "story", looking for one called "storyinfo". That
|
---|
287 | is the element that will contain the "keywords" we are
|
---|
288 | looking for. It uses the <application>libxml</application> string
|
---|
289 | comparison
|
---|
290 | function, <function><ulink
|
---|
291 | url="https://gnome.pages.gitlab.gnome.org/libxml2/devhelp/libxml2-parser.html#XMLSTRCMP">xmlStrcmp</ulink></function>. If there is a match, it calls the function <function>parseStory</function>.</para>
|
---|
292 | </callout>
|
---|
293 | </calloutlist>
|
---|
294 | </para>
|
---|
295 |
|
---|
296 | <para>
|
---|
297 | <programlisting>
|
---|
298 | void
|
---|
299 | parseStory (xmlDocPtr doc, xmlNodePtr cur) {
|
---|
300 |
|
---|
301 | xmlChar *key;
|
---|
302 | <co id="anothergetchild" /> cur = cur->xmlChildrenNode;
|
---|
303 | <co id="findkeyword" /> while (cur != NULL) {
|
---|
304 | if ((!xmlStrcmp(cur->name, (const xmlChar *)"keyword"))) {
|
---|
305 | <co id="foundkeyword" /> key = xmlNodeListGetString(doc, cur->xmlChildrenNode, 1);
|
---|
306 | printf("keyword: %s\n", key);
|
---|
307 | xmlFree(key);
|
---|
308 | }
|
---|
309 | cur = cur->next;
|
---|
310 | }
|
---|
311 | return;
|
---|
312 | }
|
---|
313 | </programlisting>
|
---|
314 | <calloutlist>
|
---|
315 | <callout arearefs="anothergetchild">
|
---|
316 | <para>Again we get the first child node.</para>
|
---|
317 | </callout>
|
---|
318 | <callout arearefs="findkeyword">
|
---|
319 | <para>Like the loop above, we then iterate through the nodes, looking
|
---|
320 | for one that matches the element we're interested in, in this case
|
---|
321 | "keyword".</para>
|
---|
322 | </callout>
|
---|
323 | <callout arearefs="foundkeyword">
|
---|
324 | <para>When we find the "keyword" element, we need to print
|
---|
325 | its contents. Remember that in <acronym>XML</acronym>, the text
|
---|
326 | contained within an element is a child node of that element, so we
|
---|
327 | turn to <varname>cur->xmlChildrenNode</varname>. To retrieve it, we
|
---|
328 | use the function <function><ulink
|
---|
329 | url="https://gnome.pages.gitlab.gnome.org/libxml2/devhelp/libxml2-tree.html#XMLNODELISTGETSTRING">xmlNodeListGetString</ulink></function>, which also takes the <varname>doc</varname> pointer as an argument. In this case, we just print it out.</para>
|
---|
330 | <note>
|
---|
331 | <para>Because <function>xmlNodeListGetString</function> allocates
|
---|
332 | memory for the string it returns, you must use
|
---|
333 | <function>xmlFree</function> to free it.</para>
|
---|
334 | </note>
|
---|
335 | </callout>
|
---|
336 | </calloutlist>
|
---|
337 | </para>
|
---|
338 |
|
---|
339 | </sect1>
|
---|
340 | <sect1 id="xmltutorialxpath">
|
---|
341 | <title>Using XPath to Retrieve Element Content</title>
|
---|
342 | <para>In addition to walking the document tree to find an element,
|
---|
343 | <application>Libxml2</application> includes support for
|
---|
344 | use of <application>XPath</application> expressions to retrieve sets of
|
---|
345 | nodes that match specified criteria. Full documentation of the
|
---|
346 | <application>XPath</application> <acronym>API</acronym> is <ulink
|
---|
347 | url="https://gnome.pages.gitlab.gnome.org/libxml2/devhelp/libxml2-xpath.html">here</ulink>.
|
---|
348 | </para>
|
---|
349 | <para><application>XPath</application> allows searching through a document
|
---|
350 | for nodes that match specified criteria. In the example below we search
|
---|
351 | through a document for the contents of all <varname>keyword</varname>
|
---|
352 | elements.
|
---|
353 | <note>
|
---|
354 | <para>A full discussion of <application>XPath</application> is beyond
|
---|
355 | the scope of this document. For details on its use, see the <ulink
|
---|
356 | url="http://www.w3.org/TR/xpath">XPath specification</ulink>.</para>
|
---|
357 | </note>
|
---|
358 | Full code for this example is at <xref linkend="xpathappendix" />.
|
---|
359 | </para>
|
---|
360 | <para>Using <application>XPath</application> requires setting up an
|
---|
361 | xmlXPathContext and then supplying the <application>XPath</application>
|
---|
362 | expression and the context to the
|
---|
363 | <function>xmlXPathEvalExpression</function> function. The function returns
|
---|
364 | an xmlXPathObjectPtr, which includes the set of nodes satisfying the
|
---|
365 | <application>XPath</application> expression.</para>
|
---|
366 | <para>
|
---|
367 | <programlisting>
|
---|
368 | xmlXPathObjectPtr
|
---|
369 | getnodeset (xmlDocPtr doc, xmlChar *xpath){
|
---|
370 |
|
---|
371 | <co id="cocontext" />xmlXPathContextPtr context;
|
---|
372 | xmlXPathObjectPtr result;
|
---|
373 |
|
---|
374 | <co id="cocreatecontext" />context = xmlXPathNewContext(doc);
|
---|
375 | <co id="corunxpath" />result = xmlXPathEvalExpression(xpath, context);
|
---|
376 | <co id="cocheckxpathresult" />if(xmlXPathNodeSetIsEmpty(result->nodesetval)){
|
---|
377 | xmlXPathFreeObject(result);
|
---|
378 | printf("No result\n");
|
---|
379 | return NULL;
|
---|
380 | </programlisting>
|
---|
381 | <calloutlist>
|
---|
382 | <callout arearefs="cocontext">
|
---|
383 | <para>First we declare our variables.</para>
|
---|
384 | </callout>
|
---|
385 | <callout arearefs="cocreatecontext">
|
---|
386 | <para>Initialize the <varname>context</varname> variable.</para>
|
---|
387 | </callout>
|
---|
388 | <callout arearefs="corunxpath">
|
---|
389 | <para>Apply the <application>XPath</application> expression.</para>
|
---|
390 | </callout>
|
---|
391 | <callout arearefs="cocheckxpathresult">
|
---|
392 | <para>Check the result and free the memory allocated to
|
---|
393 | <varname>result</varname> if no result is found.</para>
|
---|
394 | </callout>
|
---|
395 | </calloutlist>
|
---|
396 | </para>
|
---|
397 | <para>The xmlXPathObjectPtr returned by the function contains a set of nodes
|
---|
398 | and other information needed to iterate through the set and act on the
|
---|
399 | results. For this example, our function returns the
|
---|
400 | <varname>xmlXPathObjectPtr</varname>. We use it to print the contents of
|
---|
401 | <varname>keyword</varname> nodes in our document. The node set object
|
---|
402 | includes the number of elements in the set (<varname>nodeNr</varname>) and
|
---|
403 | an array of nodes (<varname>nodeTab</varname>):
|
---|
404 | <programlisting>
|
---|
405 | <co id="conodesetcounter" />for (i=0; i &lt; nodeset->nodeNr; i++) {
|
---|
406 | <co id="coprintkeywords" />keyword = xmlNodeListGetString(doc, nodeset->nodeTab[i]->xmlChildrenNode, 1);
|
---|
407 | printf("keyword: %s\n", keyword);
|
---|
408 | xmlFree(keyword);
|
---|
409 | }
|
---|
410 | </programlisting>
|
---|
411 | <calloutlist>
|
---|
412 | <callout arearefs="conodesetcounter">
|
---|
413 | <para>The value of <varname>nodeset->nodeNr</varname> holds the number of
|
---|
414 | elements in the node set. Here we use it to iterate through the array.</para>
|
---|
415 | </callout>
|
---|
416 | <callout arearefs="coprintkeywords">
|
---|
417 | <para>Here we print the contents of each of the nodes returned.
|
---|
418 | <note>
|
---|
419 | <para>Note that we are printing the child node of the node that is
|
---|
420 | returned, because the contents of the <varname>keyword</varname>
|
---|
421 | element are a child text node.</para>
|
---|
422 | </note>
|
---|
423 | </para>
|
---|
424 | </callout>
|
---|
425 | </calloutlist>
|
---|
426 | </para>
|
---|
427 | </sect1>
|
---|
428 | <sect1 id="xmltutorialwritingcontent">
|
---|
429 | <title>Writing element content</title>
|
---|
430 | <para><indexterm>
|
---|
431 | <primary>element</primary>
|
---|
432 | <secondary>writing content</secondary>
|
---|
433 | </indexterm>
|
---|
434 | Writing element content uses many of the same steps we used above
|
---|
435 | — parsing the document and walking the tree. We parse the document,
|
---|
436 | then traverse the tree to find the place we want to insert our element. For
|
---|
437 | this example, we want to again find the "storyinfo" element and
|
---|
438 | this time insert a keyword. Then we'll write the file to disk. Full code:
|
---|
439 | <xref linkend="addkeywordappendix" /></para>
|
---|
440 | <para>
|
---|
441 | The main difference in this example is in
|
---|
442 | <function>parseStory</function>:
|
---|
443 |
|
---|
444 | <programlisting>
|
---|
445 | void
|
---|
446 | parseStory (xmlDocPtr doc, xmlNodePtr cur, char *keyword) {
|
---|
447 |
|
---|
448 | <co id="addkeyword" /> xmlNewTextChild (cur, NULL, "keyword", keyword);
|
---|
449 | return;
|
---|
450 | }
|
---|
451 | </programlisting>
|
---|
452 | <calloutlist>
|
---|
453 | <callout arearefs="addkeyword">
|
---|
454 | <para>The <function><ulink
|
---|
455 | url="https://gnome.pages.gitlab.gnome.org/libxml2/devhelp/libxml2-tree.html#XMLNEWTEXTCHILD">xmlNewTextChild</ulink></function>
|
---|
456 | function adds a new child element at the
|
---|
457 | current node pointer's location in the
|
---|
458 | tree, specified by <varname>cur</varname>.</para>
|
---|
459 | </callout>
|
---|
460 | </calloutlist>
|
---|
461 | </para>
|
---|
462 |
|
---|
463 | <para>
|
---|
464 | <indexterm>
|
---|
465 | <primary>file</primary>
|
---|
466 | <secondary>saving</secondary>
|
---|
467 | </indexterm>
|
---|
468 | Once the node has been added, we would like to write the document to
|
---|
469 | file. If you want the element to have a namespace, you can add it here as
|
---|
470 | well. In our case, the namespace is NULL.
|
---|
471 | <programlisting>
|
---|
472 | xmlSaveFormatFile (docname, doc, 1);
|
---|
473 | </programlisting>
|
---|
474 | The first parameter is the name of the file to be written. You'll notice
|
---|
475 | it is the same as the file we just read. In this case, we just write over
|
---|
476 | the old file. The second parameter is a pointer to the xmlDoc
|
---|
477 | structure. Setting the third parameter equal to one ensures indenting on output.
|
---|
478 | </para>
|
---|
479 | </sect1>
|
---|
480 |
|
---|
481 | <sect1 id="xmltutorialwritingattribute">
|
---|
482 | <title>Writing Attribute</title>
|
---|
483 | <para><indexterm>
|
---|
484 | <primary>attribute</primary>
|
---|
485 | <secondary>writing</secondary>
|
---|
486 | </indexterm>
|
---|
487 | Writing an attribute is similar to writing text to a new element. In
|
---|
488 | this case, we'll add a reference <acronym>URI</acronym> to our
|
---|
489 | document. Full code: <xref linkend="addattributeappendix" />.</para>
|
---|
490 | <para>
|
---|
491 | A <sgmltag>reference</sgmltag> is a child of the <sgmltag>story</sgmltag>
|
---|
492 | element, so finding the place to put our new element and attribute is
|
---|
493 | simple. As soon as we do the error-checking test in our
|
---|
494 | <function>parseDoc</function>, we are in the right spot to add our
|
---|
495 | element. But before we do that, we need to make a declaration using a
|
---|
496 | data type we have not seen yet:
|
---|
497 | <programlisting>
|
---|
498 | xmlAttrPtr newattr;
|
---|
499 | </programlisting>
|
---|
500 | We also need an extra xmlNodePtr:
|
---|
501 | <programlisting>
|
---|
502 | xmlNodePtr newnode;
|
---|
503 | </programlisting>
|
---|
504 | </para>
|
---|
505 | <para>
|
---|
506 | The rest of <function>parseDoc</function> is the same as before until we
|
---|
507 | check to see if our root element is <sgmltag>story</sgmltag>. If it is,
|
---|
508 | then we know we are at the right spot to add our element:
|
---|
509 |
|
---|
510 | <programlisting>
|
---|
511 | <co id="addreferencenode" /> newnode = xmlNewTextChild (cur, NULL, "reference", NULL);
|
---|
512 | <co id="addattributenode" /> newattr = xmlNewProp (newnode, "uri", uri);
|
---|
513 | </programlisting>
|
---|
514 | <calloutlist>
|
---|
515 | <callout arearefs="addreferencenode">
|
---|
516 | <para>First we add a new node at the location of the current node
|
---|
517 | pointer, <varname>cur</varname>, using the <ulink
|
---|
518 | url="https://gnome.pages.gitlab.gnome.org/libxml2/devhelp/libxml2-tree.html#XMLNEWTEXTCHILD">xmlNewTextChild</ulink> function.</para>
|
---|
519 | </callout>
|
---|
520 | </calloutlist>
|
---|
521 | </para>
|
---|
522 |
|
---|
523 | <para>Once the node is added, the file is written to disk just as in the
|
---|
524 | previous example in which we added an element with text content.</para>
|
---|
525 |
|
---|
526 | </sect1>
|
---|
527 |
|
---|
528 | <sect1 id="xmltutorialattribute">
|
---|
529 | <title>Retrieving Attributes</title>
|
---|
530 | <para><indexterm>
|
---|
531 | <primary>attribute</primary>
|
---|
532 | <secondary>retrieving value</secondary>
|
---|
533 | </indexterm>
|
---|
534 | Retrieving the value of an attribute is similar to the previous
|
---|
535 | example in which we retrieved a node's text contents. In this case we'll
|
---|
536 | extract the value of the <acronym>URI</acronym> we added in the previous
|
---|
537 | section. Full code: <xref linkend="getattributeappendix" />.</para>
|
---|
538 | <para>
|
---|
539 | The initial steps for this example are similar to the previous ones: parse
|
---|
540 | the doc, find the element you are interested in, then enter a function to
|
---|
541 | carry out the specific task required. In this case, we call
|
---|
542 | <function>getReference</function>:
|
---|
543 | <programlisting>
|
---|
544 | void
|
---|
545 | getReference (xmlDocPtr doc, xmlNodePtr cur) {
|
---|
546 |
|
---|
547 | xmlChar *uri;
|
---|
548 | cur = cur->xmlChildrenNode;
|
---|
549 | while (cur != NULL) {
|
---|
550 | if ((!xmlStrcmp(cur->name, (const xmlChar *)"reference"))) {
|
---|
551 | <co id="getattributevalue" /> uri = xmlGetProp(cur, "uri");
|
---|
552 | printf("uri: %s\n", uri);
|
---|
553 | xmlFree(uri);
|
---|
554 | }
|
---|
555 | cur = cur->next;
|
---|
556 | }
|
---|
557 | return;
|
---|
558 | }
|
---|
559 | </programlisting>
|
---|
560 |
|
---|
561 | <calloutlist>
|
---|
562 | <callout arearefs="getattributevalue">
|
---|
563 | <para>
|
---|
564 | The key function is <function><ulink
|
---|
565 | url="https://gnome.pages.gitlab.gnome.org/libxml2/devhelp/libxml2-tree.html#XMLGETPROP">xmlGetProp</ulink></function>, which returns an
|
---|
566 | <varname>xmlChar</varname> containing the attribute's value. In this case,
|
---|
567 | we just print it out.
|
---|
568 | <note>
|
---|
569 | <para>
|
---|
570 | If you are using a <acronym>DTD</acronym> that declares a fixed or
|
---|
571 | default value for the attribute, this function will retrieve it.
|
---|
572 | </para>
|
---|
573 | </note>
|
---|
574 | </para>
|
---|
575 | </callout>
|
---|
576 | </calloutlist>
|
---|
577 |
|
---|
578 | </para>
|
---|
579 | </sect1>
|
---|
580 |
|
---|
581 | <sect1 id="xmltutorialconvert">
|
---|
582 | <title>Encoding Conversion</title>
|
---|
583 |
|
---|
584 | <para><indexterm>
|
---|
585 | <primary>encoding</primary>
|
---|
586 | </indexterm>
|
---|
587 | Data encoding compatibility problems are one of the most common
|
---|
588 | difficulties encountered by programmers new to <acronym>XML</acronym> in
|
---|
589 | general and <application>libxml</application> in particular. Thinking
|
---|
590 | through the design of your application in light of this issue will help
|
---|
591 | avoid difficulties later. Internally, <application>libxml</application>
|
---|
592 | stores and manipulates data in the UTF-8 format. Data used by your program
|
---|
593 | in other formats, such as the commonly used ISO-8859-1 encoding, must be
|
---|
594 | converted to UTF-8 before passing it to <application>libxml</application>
|
---|
595 | functions. If you want your program's output in an encoding other than
|
---|
596 | UTF-8, you also must convert it.</para>
|
---|
597 |
|
---|
598 | <para><application>Libxml</application> uses
|
---|
599 | <application>iconv</application> if it is available to convert
|
---|
600 | data. Without <application>iconv</application>, only UTF-8, UTF-16 and
|
---|
601 | ISO-8859-1 can be used as external formats. With
|
---|
602 | <application>iconv</application>, any format can be used provided
|
---|
603 | <application>iconv</application> is able to convert it to and from
|
---|
604 | UTF-8. Currently <application>iconv</application> supports about 150
|
---|
605 | different character formats with ability to convert from any to any. While
|
---|
606 | the actual number of supported formats varies between implementations, every
|
---|
607 | <application>iconv</application> implementation is almost guaranteed to
|
---|
608 | support every format anyone has ever heard of.</para>
|
---|
609 |
|
---|
610 | <warning>
|
---|
611 | <para>A common mistake is to use different formats for the internal data
|
---|
612 | in different parts of one's code. The most common case is an application
|
---|
613 | that assumes ISO-8859-1 to be the internal data format, combined with
|
---|
614 | <application>libxml</application>, which assumes UTF-8 to be the
|
---|
615 | internal data format. The result is an application that treats internal
|
---|
616 | data differently, depending on which code section is executing. One or
|
---|
617 | the other part of code will then, naturally, misinterpret the data.
|
---|
618 | </para>
|
---|
619 | </warning>
|
---|
620 |
|
---|
621 | <para>This example constructs a simple document, then adds content provided
|
---|
622 | at the command line to the document's root element and outputs the results
|
---|
623 | to <filename>stdout</filename> in the proper encoding. For this example, we
|
---|
624 | use ISO-8859-1 encoding. The encoding of the string input at the command
|
---|
625 | line is converted from ISO-8859-1 to UTF-8. Full code: <xref
|
---|
626 | linkend="convertappendix" /></para>
|
---|
627 |
|
---|
628 | <para>The conversion, encapsulated in the example code in the
|
---|
629 | <function>convert</function> function, uses
|
---|
630 | <application>libxml's</application>
|
---|
631 | <function>xmlFindCharEncodingHandler</function> function:
|
---|
632 | <programlisting>
|
---|
633 | <co id="handlerdatatype" />xmlCharEncodingHandlerPtr handler;
|
---|
634 | <co id="calcsize" />size = (int)strlen(in)+1;
|
---|
635 | out_size = size*2-1;
|
---|
636 | out = malloc((size_t)out_size);
|
---|
637 |
|
---|
638 | …
|
---|
639 | <co id="findhandlerfunction" />handler = xmlFindCharEncodingHandler(encoding);
|
---|
640 | …
|
---|
641 | <co id="callconversionfunction" />handler->input(out, &amp;out_size, in, &amp;temp);
|
---|
642 | …
|
---|
643 | <co id="outputencoding" />xmlSaveFormatFileEnc("-", doc, encoding, 1);
|
---|
644 | </programlisting>
|
---|
645 | <calloutlist>
|
---|
646 | <callout arearefs="handlerdatatype">
|
---|
647 | <para><varname>handler</varname> is declared as a pointer to an
|
---|
648 | <function>xmlCharEncodingHandler</function> function.</para>
|
---|
649 | </callout>
|
---|
650 | <callout arearefs="calcsize">
|
---|
651 | <para>The <function>xmlCharEncodingHandler</function> function needs
|
---|
652 | to be given the size of the input and output strings, which are
|
---|
653 | calculated here for strings <varname>in</varname> and
|
---|
654 | <varname>out</varname>.</para>
|
---|
655 | </callout>
|
---|
656 | <callout arearefs="findhandlerfunction">
|
---|
657 | <para><function>xmlFindCharEncodingHandler</function> takes as its
|
---|
658 | argument the data's initial encoding and searches
|
---|
659 | <application>libxml's</application> built-in set of conversion
|
---|
660 | handlers, returning a pointer to the matching handler or NULL if none is
|
---|
661 | found.</para>
|
---|
662 | </callout>
|
---|
663 | <callout arearefs="callconversionfunction">
|
---|
664 | <para>The conversion function identified by <varname>handler</varname>
|
---|
665 | requires as its arguments pointers to the input and output strings,
|
---|
666 | along with the length of each. The lengths must be determined
|
---|
667 | separately by the application.</para>
|
---|
668 | </callout>
|
---|
669 | <callout arearefs="outputencoding">
|
---|
670 | <para>To output in a specified encoding rather than UTF-8, we use
|
---|
671 | <function>xmlSaveFormatFileEnc</function>, specifying the
|
---|
672 | encoding.</para>
|
---|
673 | </callout>
|
---|
674 | </calloutlist>
|
---|
675 | </para>
|
---|
676 | </sect1>
|
---|
677 |
|
---|
678 | <appendix id="compilation">
|
---|
679 | <title>Compilation</title>
|
---|
680 | <para><indexterm>
|
---|
681 | <primary>compiler flags</primary>
|
---|
682 | </indexterm>
|
---|
683 | <application>Libxml</application> includes a script,
|
---|
684 | <application>xml2-config</application>, that can be used to generate
|
---|
685 | flags for compilation and linking of programs written with the
|
---|
686 | library. For pre-processor and compiler flags, use <command>xml2-config
|
---|
687 | --cflags</command>. For library linking flags, use <command>xml2-config
|
---|
688 | --libs</command>. Other options are available using <command>xml2-config
|
---|
689 | --help</command>.</para>
|
---|
690 | </appendix>
|
---|
691 |
|
---|
692 | <appendix id="sampledoc">
|
---|
693 | <title>Sample Document</title>
|
---|
694 | <programlisting>&STORY;</programlisting>
|
---|
695 | </appendix>
|
---|
696 | <appendix id="keywordappendix">
|
---|
697 | <title>Code for Keyword Example</title>
|
---|
698 | <para>
|
---|
699 | <programlisting>&KEYWORD;</programlisting>
|
---|
700 | </para>
|
---|
701 | </appendix>
|
---|
702 | <appendix id="xpathappendix">
|
---|
703 | <title>Code for XPath Example</title>
|
---|
704 | <para>
|
---|
705 | <programlisting>&XPATH;</programlisting>
|
---|
706 | </para>
|
---|
707 | </appendix>
|
---|
708 | <appendix id="addkeywordappendix">
|
---|
709 | <title>Code for Add Keyword Example</title>
|
---|
710 | <para>
|
---|
711 | <programlisting>&ADDKEYWORD;</programlisting>
|
---|
712 | </para>
|
---|
713 | </appendix>
|
---|
714 | <appendix id="addattributeappendix">
|
---|
715 | <title>Code for Add Attribute Example</title>
|
---|
716 | <para>
|
---|
717 | <programlisting>&ADDATTRIBUTE;</programlisting>
|
---|
718 | </para>
|
---|
719 | </appendix>
|
---|
720 | <appendix id="getattributeappendix">
|
---|
721 | <title>Code for Retrieving Attribute Value Example</title>
|
---|
722 | <para>
|
---|
723 | <programlisting>&GETATTRIBUTE;</programlisting>
|
---|
724 | </para>
|
---|
725 | </appendix>
|
---|
726 | <appendix id="convertappendix">
|
---|
727 | <title>Code for Encoding Conversion Example</title>
|
---|
728 | <para>
|
---|
729 | <programlisting>&CONVERT;</programlisting>
|
---|
730 | </para>
|
---|
731 | </appendix>
|
---|
732 | <appendix>
|
---|
733 | <title>Acknowledgements</title>
|
---|
734 | <para>A number of people have generously offered feedback, code and
|
---|
735 | suggested improvements to this tutorial. In no particular order:
|
---|
736 | <simplelist type="inline">
|
---|
737 | <member>Daniel Veillard</member>
|
---|
738 | <member>Marcus Labib Iskander</member>
|
---|
739 | <member>Christopher R. Harris</member>
|
---|
740 | <member>Igor Zlatkovic</member>
|
---|
741 | <member>Niraj Tolia</member>
|
---|
742 | <member>David Turover</member>
|
---|
743 | </simplelist>
|
---|
744 | </para>
|
---|
745 | </appendix>
|
---|
746 | <index />
|
---|
747 | </article>
|
---|