genUnicode.py@ 9259

Last change on this file since 9259 was 6076, checked in by vboxsync, 17 years ago
Merged dmik/s2 branch (r25959:26751) to the trunk.
Property svn:eol-style set to `native` Property svn:keywords set to `Date Revision Author Id`
File size: 12.7 KB

Line
1	#!/usr/bin/python -u
2	#
3	# Original script modified in November 2003 to take advantage of
4	# the character-validation range routines, and updated to the
5	# current Unicode information (Version 4.0.1)
6	#
7	# NOTE: there is an 'alias' facility for blocks which are not present in
8	# the current release, but are needed for ABI compatibility. This
9	# must be accomplished MANUALLY! Please see the comments below under
10	# 'blockAliases'
11	#
12	import sys
13	import string
14	import time
15
# Location of the Unicode Character Database release the tables are built
# from, and the two local input files (blocks file, then categories file).
webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"

#
# blockAliases is a small hack - it is used for mapping block names which
# were used in the 3.1 release, but are missing or changed in the current
# release.  The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
blockAliases = [
    "CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols",
    "Greek:GreekandCoptic",
    "PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A,"
    "SupplementaryPrivateUseArea-B",
]

# minTableSize is the threshold for emitting a range table: a category
# gets a table only when it has more than this many ranges; otherwise
# inline comparisons are generated.
minTableSize = 8

# `sources` holds exactly two whitespace-separated file names
(blockfile, catfile) = sources.split()
35
36
37	#
38	# Now process the "blocks" file, reducing it to a dictionary
39	# indexed by blockname, containing a tuple with the applicable
40	# block range
41	#
# BlockNames maps a block name (spaces removed) to a list of
# ("0x<start>", "0x<end>") string pairs taken from the blocks file.
BlockNames = {}
try:
    blocks = open(blockfile, "r")
except IOError:
    print("Missing %s, aborting ..." % blockfile)
    sys.exit(1)

try:
    for line in blocks:
        # skip comment lines and blank lines
        if line[0] == '#':
            continue
        line = line.strip()
        if line == '':
            continue
        # each entry looks like "<start>..<end>; <block name>"
        try:
            fields = line.split(';')
            (start, end) = fields[0].strip().split("..")
            name = fields[1].strip().replace(' ', '')
        except (IndexError, ValueError):
            # missing ';' field, or the range is not exactly "a..b"
            print("Failed to process line: %s" % (line))
            continue
        # the bounds are kept as text and pasted verbatim into the C source
        BlockNames.setdefault(name, []).append(("0x" + start, "0x" + end))
finally:
    # release the input file even if parsing raised unexpectedly
    blocks.close()
print("Parsed %d blocks descriptions" % (len(BlockNames)))
72
# Register each legacy block name from blockAliases as the concatenation of
# the ranges of the current blocks it maps to.  Components that are not
# present in BlockNames are reported and skipped.
for block in blockAliases:
    alias = block.split(':')
    oldname = alias[0]
    for comp in alias[1].split(','):
        if comp in BlockNames:
            # extend() works on a snapshot of its argument, so this stays
            # finite even if an alias were to name itself
            BlockNames.setdefault(oldname, []).extend(BlockNames[comp])
        else:
            print("Alias %s: %s not in Blocks" % (oldname, comp))
85
86	#
87	# Next process the Categories file. This is more complex, since
88	# the file is in code sequence, and we need to invert it. We use
89	# a dictionary with index category-name, with each entry containing
90	# all the ranges (codepoints) of that category. Note that category
91	# names comprise two parts - the general category, and the "subclass"
92	# within that category. Therefore, both "general category" (which is
93	# the first character of the 2-character category-name) and the full
94	# (2-character) name are entered into this dictionary.
95	#
try:
    data = open(catfile, "r")
except IOError:
    print("Missing %s, aborting ..." % catfile)
    sys.exit(1)

# nbchar counts the entries accepted; Categories maps both the full category
# name and its first character to the list of code points seen, in file order.
# NOTE(review): only fields 0 and 2 of each entry are read, so entries that
# merely mark the first/last code point of a large range are recorded as two
# isolated code points - confirm whether such ranges should be expanded.
nbchar = 0
Categories = {}
try:
    for line in data:
        # skip comment lines and blank lines
        if line[0] == '#':
            continue
        line = line.strip()
        if line == '':
            continue
        try:
            fields = line.split(';')
            # field 0 is the code point in hexadecimal; a malformed value is
            # now reported instead of being silently mis-parsed
            value = int(fields[0].strip(), 16)
            name = fields[2]
        except (IndexError, ValueError):
            print("Failed to process line: %s" % (line))
            continue

        nbchar = nbchar + 1
        # update entry for "full name"
        Categories.setdefault(name, []).append(value)
        # update "general category" name
        if name:
            Categories.setdefault(name[0], []).append(value)
        else:
            # an empty category field has no general category
            print("Failed to process line: %s" % (line))
finally:
    # close the categories file (the original closed `blocks` here instead,
    # leaving this handle open)
    data.close()
print("Parsed %d char generating %d categories" % (nbchar, len(Categories)))
148
149	#
150	# The data is now all read. Time to process it into a more useful form.
151	#
152	# reduce the number list into ranges
# Replace each category's list of code points by a list of inclusive
# (first, last) tuples, merging runs of consecutive values.  A lone code
# point becomes (value, value).
for cat in Categories.keys():
    codepoints = Categories[cat]
    collapsed = []
    first = -1      # -1 means "no run currently open"
    last = -1
    for val in codepoints:
        if first != -1 and val != last + 1:
            # the value does not continue the open run: flush it
            collapsed.append((first, last))
            first = -1
        if first == -1:
            first = val
        last = val
    # flush the final run
    collapsed.append((first, last))
    Categories[cat] = collapsed
182
183	#
184	# Assure all data is in alphabetic order, since we will be doing binary
185	# searches on the tables.
186	#
# Sorted name lists: the generated C name tables are emitted in this order
# and searched with a binary search.  sorted() returns a real list, whereas
# dict.keys() is only sortable in place on Python 2.
bkeys = sorted(BlockNames)

ckeys = sorted(Categories)
192
193	#
194	# Generate the resulting files
195	#
# Open the two generated files; abort if either one cannot be created.
try:
    header = open("include/libxml/xmlunicode.h", "w")
except IOError:
    print("Failed to open include/libxml/xmlunicode.h")
    sys.exit(1)

try:
    output = open("xmlunicode.c", "w")
except IOError:
    print("Failed to open xmlunicode.c")
    sys.exit(1)

# Local-time timestamp embedded in the banner of the generated files
date = time.asctime(time.localtime(time.time()))

# Banner and opening guards of the generated header; the matching closing
# guards are written at the very end of the script.
header.write(
"""/*
* Summary: Unicode character APIs
* Description: API for the Unicode character APIs
*
* This file is automatically generated from the
* UCS description files of the Unicode Character Database
* %s
* using the genUnicode.py Python script.
*
* Generation date: %s
* Sources: %s
* Author: Daniel Veillard
*/

#ifndef __XML_UNICODE_H__
#define __XML_UNICODE_H__

#include <libxml/xmlversion.h>

#ifdef LIBXML_UNICODE_ENABLED

#ifdef __cplusplus
extern "C" {
#endif

""" % (webpage, date, sources))
237
# Banner, includes and private declarations of the generated C file.
# xmlUnicodeLookup is declared as returning a pointer to a range function and
# taking a pointer to the name table: its body dereferences tptr with "->",
# returns sptr[mid].func, and the emitted callers pass "&table".
output.write(
"""/*
* xmlunicode.c: this module implements the Unicode character APIs
*
* This file is automatically generated from the
* UCS description files of the Unicode Character Database
* %s
* using the genUnicode.py Python script.
*
* Generation date: %s
* Sources: %s
* Daniel Veillard <[email protected]>
*/

#define IN_LIBXML
#include "libxml.h"

#ifdef LIBXML_UNICODE_ENABLED

#include <string.h>
#include <libxml/xmlversion.h>
#include <libxml/xmlunicode.h>
#include <libxml/chvalid.h>

typedef int (xmlIntFunc)(int); /* just to keep one's mind untwisted */

typedef struct {
const char *rangename;
xmlIntFunc *func;
} xmlUnicodeRange;

typedef struct {
xmlUnicodeRange *table;
int numentries;
} xmlUnicodeNameTable;


static xmlIntFunc *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname);

static xmlUnicodeRange xmlUnicodeBlocks[] = {
""" % (webpage, date, sources))

# Name table for blocks: one {"BlockName", function} entry per block, in
# sorted order.  '-' is dropped from the name to form the C identifier.
output.write(',\n'.join(
    [' {"%s", xmlUCSIs%s}' % (block, block.replace('-', ''))
     for block in bkeys]))
output.write('};\n\n')

# Name table for categories, built the same way
output.write('static xmlUnicodeRange xmlUnicodeCats[] = {\n')
output.write(',\n'.join(
    [' {"%s", xmlUCSIsCat%s}' % (name, name) for name in ckeys]))
output.write('};\n\n')

#
# For any categories with more than minTableSize ranges we generate
# a range table suitable for xmlCharInRange
#
for name in ckeys:
    if len(Categories[name]) > minTableSize:
        numshort = 0
        numlong = 0
        sptr = "NULL"
        lptr = "NULL"
        for (low, high) in Categories[name]:
            if high < 0x10000:
                # range goes into the "short" array
                if numshort == 0:
                    pline = "static const xmlChSRange xml%sS[] = {" % name
                    sptr = "xml%sS" % name
                else:
                    pline += ", "
                numshort += 1
            else:
                # range goes into the "long" array
                if numlong == 0:
                    # terminate the short array, if one was started
                    if numshort > 0:
                        output.write(pline + " };\n")
                    pline = "static const xmlChLRange xml%sL[] = {" % name
                    lptr = "xml%sL" % name
                else:
                    pline += ", "
                numlong += 1
            # wrap the generated line once it grows past 60 characters
            if len(pline) > 60:
                output.write(pline + "\n")
                pline = " "
            pline += "{%s, %s}" % (hex(low), hex(high))
        # close the last array and emit the group tying both arrays together
        output.write(
            pline + " };\nstatic xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
            % (name, numshort, numlong, sptr, lptr))


# The two table descriptors plus the binary-search lookup shared by the
# by-name entry points.  The C "||" operator is written literally.
output.write(
"""static xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
static xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};

/**
* xmlUnicodeLookup:
* @tptr: pointer to the name table
* @tname: name to be found
*
* binary table lookup for user-supplied name
*
* Returns pointer to range function if found, otherwise NULL
*/
static xmlIntFunc *
xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname) {
int low, high, mid, cmp;
xmlUnicodeRange *sptr;

if ((tptr == NULL) || (tname == NULL)) return(NULL);

low = 0;
high = tptr->numentries - 1;
sptr = tptr->table;
while (low <= high) {
mid = (low + high) / 2;
if ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
return (sptr[mid].func);
if (cmp < 0)
high = mid - 1;
else
low = mid + 1;
}
return (NULL);
}

""" % (len(BlockNames), len(Categories)))
373
# For every block, emit the prototype into the header and a C function into
# the source file that tests the code point against each of the block's
# ranges.  The range bounds are the "0x..." strings stored in BlockNames.
for block in bkeys:
    # '-' is dropped from the block name to form the C identifier
    name = block.replace('-', '')
    header.write("XMLPUBFUN int XMLCALL xmlUCSIs%s\t(int code);\n" % name)
    # doc comment in the same "/** ... */" form as the other generated
    # functions
    output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (name))
    output.write(" *\n * Check whether the character is part of %s UCS Block\n"
                 % (block))
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
    output.write("int\nxmlUCSIs%s(int code) {\n return(" % name)
    # one comparison per range, joined with the C "||" operator
    tests = ["((code >= %s) && (code <= %s))" % (start, end)
             for (start, end) in BlockNames[block]]
    output.write(" ||\n ".join(tests))
    output.write(");\n}\n\n")
390
# Prototype of the by-name block test, added to the generated header
isblock_proto = "\nXMLPUBFUN int XMLCALL xmlUCSIsBlock\t(int code, const char *block);\n\n"

# Its C implementation: look the block name up, then call the range function
isblock_impl = """/**
* xmlUCSIsBlock:
* @code: UCS code point
* @block: UCS block name
*
* Check whether the character is part of the UCS Block
*
* Returns 1 if true, 0 if false and -1 on unknown block
*/
int
xmlUCSIsBlock(int code, const char *block) {
xmlIntFunc *func;

func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
if (func == NULL)
return (-1);
return (func(code));
}

"""

header.write(isblock_proto)
output.write(isblock_impl)
413
# For every category, emit the prototype into the header and a C function
# into the source file.  Categories with more than minTableSize ranges
# delegate to xmlCharInRange on the xml<Name>G group; the others get inline
# comparisons.
for name in ckeys:
    ranges = Categories[name]
    header.write("XMLPUBFUN int XMLCALL xmlUCSIsCat%s\t(int code);\n" % name)
    # doc comment in the same "/** ... */" form as the other generated
    # functions
    output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
    output.write(" *\n * Check whether the character is part of %s UCS Category\n"
                 % (name))
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
    output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
    if len(ranges) > minTableSize:
        output.write(" return(xmlCharInRange((unsigned int)code, &xml%sG)"
                     % name)
    else:
        tests = []
        for (begin, end) in ranges:
            if begin == end:
                # single code point: plain equality test
                tests.append("(code == %s)" % (hex(begin)))
            else:
                tests.append("((code >= %s) && (code <= %s))" % (
                    hex(begin), hex(end)))
        if tests:
            # comparisons are joined with the C "||" operator
            output.write(" return(" + " ||\n ".join(tests))
    output.write(");\n}\n\n")
440
# Prototype of the by-name category test
iscat_proto = "\nXMLPUBFUN int XMLCALL xmlUCSIsCat\t(int code, const char *cat);\n"

# Its C implementation, followed by the closing lines of the C file
iscat_impl = """/**
* xmlUCSIsCat:
* @code: UCS code point
* @cat: UCS Category name
*
* Check whether the character is part of the UCS Category
*
* Returns 1 if true, 0 if false and -1 on unknown category
*/
int
xmlUCSIsCat(int code, const char *cat) {
xmlIntFunc *func;

func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
if (func == NULL)
return (-1);
return (func(code));
}

#define bottom_xmlunicode
#include "elfgcchack.h"
#endif /* LIBXML_UNICODE_ENABLED */
"""

# Closing guards of the generated header
header_trailer = """
#ifdef __cplusplus
}
#endif

#endif /* LIBXML_UNICODE_ENABLED */

#endif /* __XML_UNICODE_H__ */
"""

# Finish the header first, then the C file; the two streams are independent
header.write(iscat_proto)
header.write(header_trailer)
header.close()

output.write(iscat_impl)
output.close()

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/libxml2-2.6.30/genUnicode.py@ 9259

Download in other formats: