index.py@ 78404

Last change on this file since 78404 was 65950, checked in by vboxsync, 8 years ago
libxml 2.9.4: fix export
Property svn:eol-style set to `LF` Property svn:executable set to ``*
File size: 32.1 KB

Line
1	#!/usr/bin/python -u
2	#
3	# imports the API description and fills up a database with
4	# name relevance to modules, functions or web pages
5	#
6	# Operation needed:
7	# =================
8	#
9	# install mysqld, the python wrappers for mysql and libxml2, start mysqld
10	# Change the root passwd of mysql:
11	# mysqladmin -u root password new_password
12	# Create the new database xmlsoft
13	# mysqladmin -p create xmlsoft
14	# Create a database user 'veillard' and give him password access
15	# change veillard and abcde with the right user name and passwd
16	# mysql -p
17	# password:
18	# mysql> GRANT ALL PRIVILEGES ON xmlsoft TO veillard@localhost
19	# IDENTIFIED BY 'abcde' WITH GRANT OPTION;
20	#
21	# As the user check the access:
22	# mysql -p xmlsoft
23	# Enter password:
24	# Welcome to the MySQL monitor....
25	# mysql> use xmlsoft
26	# Database changed
27	# mysql> quit
28	# Bye
29	#
30	# Then run the script in the doc subdir, it will create the symbols and
31	# word tables and populate them with information extracted from
32	# the libxml2-api.xml API description, and make them accessible read-only
33	# by nobody@localhost, the user Apache is expected to run as
34	#
35	# On the Apache configuration, make sure you have php support enabled
36	#
37
38	import MySQLdb
39	import libxml2
40	import sys
41	import string
42	import os
43
44	#
45	# We are not interested in parsing errors here
46	#
def callback(ctx, str):
    """Error handler given to libxml2: silently discards every message."""
    return None

# Install the no-op handler so parser diagnostics are not reported.
libxml2.registerErrorHandler(callback, None)
50
51	#
52	# The dictionary of tables required and the SQL command needed
53	# to create them
54	#
# Maps each required table name to the CREATE TABLE statement building it.
TABLES = dict(
    symbols="""CREATE TABLE symbols (
           name varchar(255) BINARY NOT NULL,
           module varchar(255) BINARY NOT NULL,
           type varchar(25) NOT NULL,
           descr varchar(255),
           UNIQUE KEY name (name),
           KEY module (module))""",
    words="""CREATE TABLE words (
           name varchar(50) BINARY NOT NULL,
           symbol varchar(255) BINARY NOT NULL,
           relevance int,
           KEY name (name),
           KEY symbol (symbol),
           UNIQUE KEY ID (name, symbol))""",
    wordsHTML="""CREATE TABLE wordsHTML (
           name varchar(50) BINARY NOT NULL,
           resource varchar(255) BINARY NOT NULL,
           section varchar(255),
           id varchar(50),
           relevance int,
           KEY name (name),
           KEY resource (resource),
           UNIQUE KEY ref (name, resource))""",
    wordsArchive="""CREATE TABLE wordsArchive (
           name varchar(50) BINARY NOT NULL,
           ID int(11) NOT NULL,
           relevance int,
           KEY name (name),
           UNIQUE KEY ref (name, ID))""",
    pages="""CREATE TABLE pages (
           resource varchar(255) BINARY NOT NULL,
           title varchar(255) BINARY NOT NULL,
           UNIQUE KEY name (resource))""",
    archives="""CREATE TABLE archives (
           ID int(11) NOT NULL auto_increment,
           resource varchar(255) BINARY NOT NULL,
           title varchar(255) BINARY NOT NULL,
           UNIQUE KEY id (ID,resource(255)),
           INDEX (ID),
           INDEX (resource))""",
    Queries="""CREATE TABLE Queries (
           ID int(11) NOT NULL auto_increment,
           Value varchar(50) NOT NULL,
           Count int(11) NOT NULL,
           UNIQUE KEY id (ID,Value(35)),
           INDEX (ID))""",
    AllQueries="""CREATE TABLE AllQueries (
           ID int(11) NOT NULL auto_increment,
           Value varchar(50) NOT NULL,
           Count int(11) NOT NULL,
           UNIQUE KEY id (ID,Value(35)),
           INDEX (ID))""",
)
109
110	#
111	# The XML API description file to parse
112	#
API = "libxml2-api.xml"    # file name of the XML API description
DB = None                  # database connection; set by openMySQL()
115
116	#########################################################################
117	# #
118	# MySQL database interfaces #
119	# #
120	#########################################################################
def createTable(db, name):
    """Drop and re-create one of the tables described in TABLES.

    Returns the value of the CREATE statement's execute() call, or -1 when
    db or name is None, when name is not a key of TABLES, or when the
    creation fails.
    """
    if db is None or name is None:
        return -1
    # Check the name *before* dropping anything: an unknown name would
    # otherwise destroy an existing table that we cannot re-create.
    if name not in TABLES:
        print("Failed to create table %s" % (name))
        return -1

    c = db.cursor()
    ret = c.execute("DROP TABLE IF EXISTS %s" % (name))
    if ret == 1:
        print("Removed table %s" % (name))
    print("Creating table %s" % (name))
    try:
        ret = c.execute(TABLES[name])
    except Exception:
        print("Failed to create table %s" % (name))
        return -1
    return ret
140
def checkTables(db, verbose = 1):
    """Make sure every table listed in TABLES exists and is readable.

    Missing tables are created through createTable(); a table whose row
    count cannot be read is repaired and counted again.  Finally read
    access is granted to nobody@localhost (best effort).

    Returns 0, or -1 when db is None.
    """
    if db is None:
        return -1
    c = db.cursor()
    nbtables = c.execute("show tables")
    if verbose:
        print("Found %d tables" % (nbtables))
    # The table name is the first column of each SHOW TABLES row.
    tables = set([row[0] for row in c.fetchall()])

    for table in TABLES:
        if table not in tables:
            print("table %s missing" % (table))
            createTable(db, table)
        try:
            c.execute("SELECT count(*) from %s" % table)
            row = c.fetchone()
            if verbose:
                print("Table %s contains %d records" % (table, row[0]))
        except Exception:
            print("Troubles with table %s : repairing" % (table))
            ret = c.execute("repair table %s" % table)
            print("repairing returned %d" % (ret))
            c.execute("SELECT count(*) from %s" % table)
            row = c.fetchone()
            print("Table %s contains %d records" % (table, row[0]))
    if verbose:
        print("checkTables finished")

    # make sure apache can access the tables read-only
    try:
        c.execute("GRANT SELECT ON xmlsoft.* TO nobody@localhost")
        c.execute("GRANT INSERT,SELECT,UPDATE ON xmlsoft.Queries TO nobody@localhost")
    except Exception:
        # Still best effort, but no longer invisible.
        if verbose:
            print("Could not grant access to nobody@localhost: %s" %
                  (sys.exc_info()[1]))
    return 0
184
def openMySQL(db="xmlsoft", passwd=None, verbose = 1):
    """Connect to the MySQL database, store the connection in the global
    DB and check the tables.

    When passwd is None it is read from the MySQL_PASS environment
    variable; the process exits with status 1 if that is not set.

    Returns the result of checkTables(), or -1 if no connection object
    was obtained.
    """
    global DB

    if passwd is None:
        passwd = os.environ.get("MySQL_PASS")
        if passwd is None:
            print("No password available, set environment MySQL_PASS")
            sys.exit(1)

    DB = MySQLdb.connect(passwd=passwd, db=db)
    if DB is None:
        return -1
    return checkTables(DB, verbose)
200
def updateWord(name, symbol, relevance):
    """Record the relevance of word `name` for `symbol` in the words table.

    An INSERT is tried first; if it fails (typically because the
    (name, symbol) pair already exists) the existing row is updated.
    Values are passed as query parameters so the driver escapes them.

    Returns the execute() result, or -1 when there is no database
    connection, an argument is None, or both statements fail.
    """
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or symbol is None:
        return -1

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO words (name, symbol, relevance) VALUES (%s, %s, %s)",
            (name, symbol, relevance))
    except Exception:
        try:
            ret = c.execute(
                "UPDATE words SET relevance = %s where name = %s and symbol = %s",
                (relevance, name, symbol))
        except Exception:
            print("Update word (%s, %s, %s) failed command" % (name, symbol, relevance))
            print("UPDATE words SET relevance = %s where name = '%s' and symbol = '%s'" % (relevance, name, symbol))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
230
def updateSymbol(name, module, type, desc):
    """Insert or refresh one row of the symbols table.

    The symbol name is first indexed as a word for itself with
    relevance 50.  desc is reduced to the text before its first ".",
    with apostrophes replaced by spaces and at most 99 characters kept;
    a non-string desc is stored as "".

    Returns the execute() result, or -1 when there is no database
    connection, a required argument is None, or both the INSERT and the
    fallback UPDATE fail.
    """
    updateWord(name, name, 50)
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or module is None or type is None:
        return -1

    try:
        desc = desc.replace("'", " ").split(".")[0][0:99]
    except (AttributeError, TypeError):
        desc = ""

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO symbols (name, module, type, descr) VALUES (%s, %s, %s, %s)",
            (name, module, type, desc))
    except Exception:
        # Usually a duplicate name: refresh the existing row instead.
        try:
            ret = c.execute(
                "UPDATE symbols SET module=%s, type=%s, descr=%s where name=%s",
                (module, type, desc, name))
        except Exception:
            print("Update symbol (%s, %s, %s) failed command" % (name, module, type))
            print("""UPDATE symbols SET module='%s', type='%s', descr='%s' where name='%s'""" % (module, type, desc, name))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
271
def addFunction(name, module, desc = ""):
    """Register `name` from `module` as a symbol of type 'function'."""
    result = updateSymbol(name, module, 'function', desc)
    return result
274
def addMacro(name, module, desc = ""):
    """Register `name` from `module` as a symbol of type 'macro'."""
    result = updateSymbol(name, module, 'macro', desc)
    return result
277
def addEnum(name, module, desc = ""):
    """Register `name` from `module` as a symbol of type 'enum'."""
    result = updateSymbol(name, module, 'enum', desc)
    return result
280
def addStruct(name, module, desc = ""):
    """Register `name` from `module` as a symbol of type 'struct'."""
    result = updateSymbol(name, module, 'struct', desc)
    return result
283
def addConst(name, module, desc = ""):
    """Register `name` from `module` as a symbol of type 'const'."""
    result = updateSymbol(name, module, 'const', desc)
    return result
286
def addType(name, module, desc = ""):
    """Register `name` from `module` as a symbol of type 'type'."""
    result = updateSymbol(name, module, 'type', desc)
    return result
289
def addFunctype(name, module, desc = ""):
    """Register `name` from `module` as a symbol of type 'functype'."""
    result = updateSymbol(name, module, 'functype', desc)
    return result
292
def addPage(resource, title):
    """Record the title of a web page in the pages table.

    An INSERT is tried first; on failure the title of the existing row
    for `resource` is updated.  Values are passed as query parameters.

    Returns the execute() result, or -1 when there is no database
    connection, resource is None, or both statements fail.
    """
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if resource is None:
        return -1

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO pages (resource, title) VALUES (%s, %s)",
            (resource, title))
    except Exception:
        try:
            ret = c.execute(
                "UPDATE pages SET title=%s WHERE resource=%s",
                (title, resource))
        except Exception:
            # The original message formatted name/module/type, which do not
            # exist in this function and raised NameError here.
            print("Update page (%s, %s) failed command" % (resource, title))
            print("""UPDATE pages SET title='%s' WHERE resource='%s'""" % (title, resource))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
320
def updateWordHTML(name, resource, desc, id, relevance):
    """Record word `name` as found in web page `resource` (wordsHTML table).

    desc is the section title (apostrophes replaced by spaces, at most 99
    characters; None or a non-string becomes ""), id the anchor of that
    section (None becomes "").  An INSERT is tried first, then an UPDATE
    of the existing (name, resource) row.

    Returns the execute() result, or -1 when there is no database
    connection, name or resource is None, or both statements fail.
    """
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or resource is None:
        return -1
    if id is None:
        id = ""
    if desc is None:
        desc = ""
    else:
        try:
            desc = desc.replace("'", " ")[0:99]
        except (AttributeError, TypeError):
            desc = ""

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO wordsHTML (name, resource, section, id, relevance) VALUES (%s, %s, %s, %s, %s)",
            (name, resource, desc, id, relevance))
    except Exception:
        try:
            ret = c.execute(
                "UPDATE wordsHTML SET section=%s, id=%s, relevance=%s where name=%s and resource=%s",
                (desc, id, relevance, name, resource))
        except Exception:
            print("Update word HTML (%s, %s, %s) failed command" % (name, resource, relevance))
            print("""UPDATE wordsHTML SET section='%s', id='%s', relevance='%s' where name='%s' and resource='%s'""" % (desc, id, relevance, name, resource))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
360
def checkXMLMsgArchive(url):
    """Look up the ID under which an archive message URL is stored.

    Returns the ID column of the matching archives row, or -1 when there
    is no database connection, url is None, no row matches, or the query
    fails.  The URL is passed as a query parameter.
    """
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if url is None:
        return -1

    c = DB.cursor()
    try:
        c.execute("SELECT ID FROM archives WHERE resource=%s", (url,))
        row = c.fetchone()
    except Exception:
        return -1
    if row is None:
        return -1
    return row[0]
382
def addXMLMsgArchive(url, title):
    """Insert an archive message into the archives table and return its ID.

    title has apostrophes replaced by spaces and is cut to 99 characters
    (None becomes "").  Values are passed as query parameters.

    Returns the ID as an int, or -1 when there is no database connection,
    url is None, a statement fails, or the new row cannot be read back.
    """
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if url is None:
        return -1
    if title is None:
        title = ""
    else:
        title = title.replace("'", " ")[0:99]

    c = DB.cursor()
    # cmd always holds the statement being run, for the error message.
    cmd = "INSERT INTO archives (resource, title) VALUES (%s, %s)"
    try:
        c.execute(cmd, (url, title))
        cmd = "SELECT ID FROM archives WHERE resource=%s"
        c.execute(cmd, (url,))
        row = c.fetchone()
        if row is None:
            print("addXMLMsgArchive failed to get the ID: %s" % (url))
            return -1
    except Exception:
        print("addXMLMsgArchive failed command: %s" % (cmd))
        return -1

    return int(row[0])
413
def updateWordArchive(name, id, relevance):
    """Record the relevance of word `name` for archive message `id`.

    An INSERT into wordsArchive is tried first, then an UPDATE of the
    existing (name, ID) row.  Values are passed as query parameters.

    Returns the execute() result, or -1 when there is no database
    connection, name or id is None, or both statements fail.
    """
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or id is None:
        return -1

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO wordsArchive (name, id, relevance) VALUES (%s, %s, %s)",
            (name, id, relevance))
    except Exception:
        try:
            ret = c.execute(
                "UPDATE wordsArchive SET relevance=%s where name=%s and ID=%s",
                (relevance, name, id))
        except Exception:
            print("Update word archive (%s, %s, %s) failed command" % (name, id, relevance))
            print("""UPDATE wordsArchive SET relevance='%s' where name='%s' and ID='%s'""" % (relevance, name, id))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
443
444	#########################################################################
445	# #
446	# Word dictionary and analysis routines #
447	# #
448	#########################################################################
449
450	#
451	# the 100 most common English words (minus those shorter than 3 letters), plus our own additions
452	#
# Words never indexed.  Kept as a dict (every value is 0) because the
# lookups elsewhere in the file rely on the dict interface.
dropWords = dict.fromkeys([
    'the', 'this', 'can', 'man', 'had', 'him', 'only',
    'and', 'not', 'been', 'other', 'even', 'are', 'was',
    'new', 'most', 'but', 'when', 'some', 'made', 'from',
    'who', 'could', 'after', 'that', 'will', 'time', 'also',
    'have', 'more', 'these', 'did', 'two', 'many',
    'they', 'may', 'before', 'for', 'which', 'out', 'then',
    'must', 'one', 'through', 'with', 'you', 'said',
    'first', 'back', 'were', 'what', 'any', 'years', 'his',
    'her', 'where', 'all', 'its', 'now', 'much', 'she',
    'about', 'such', 'your', 'there', 'into', 'like',
    'would', 'than', 'our', 'well', 'their', 'them', 'over',
    'down',
    # site-specific additions
    'net', 'www', 'bad', 'Okay', 'bin', 'cur',
], 0)
468
# In-memory indexes filled by addWord(), addWordHTML() and addWordArchive().
wordsDict = dict()
wordsDictHTML = dict()
wordsDictArchive = dict()
472
473	def cleanupWordsString(str):
474	str = string.replace(str, ".", " ")
475	str = string.replace(str, "!", " ")
476	str = string.replace(str, "?", " ")
477	str = string.replace(str, ",", " ")
478	str = string.replace(str, "'", " ")
479	str = string.replace(str, '"', " ")
480	str = string.replace(str, ";", " ")
481	str = string.replace(str, "(", " ")
482	str = string.replace(str, ")", " ")
483	str = string.replace(str, "{", " ")
484	str = string.replace(str, "}", " ")
485	str = string.replace(str, "<", " ")
486	str = string.replace(str, ">", " ")
487	str = string.replace(str, "=", " ")
488	str = string.replace(str, "/", " ")
489	str = string.replace(str, "*", " ")
490	str = string.replace(str, ":", " ")
491	str = string.replace(str, "#", " ")
492	str = string.replace(str, "\\", " ")
493	str = string.replace(str, "\n", " ")
494	str = string.replace(str, "\r", " ")
495	str = string.replace(str, "\xc2", " ")
496	str = string.replace(str, "\xa0", " ")
497	return str
498
def cleanupDescrString(str):
    """Return `str` cleaned up for use as a one-line description.

    Apostrophes, newlines, carriage returns, "\\xc2" and "\\xa0" become
    spaces, then every run of whitespace is collapsed to a single space
    and leading/trailing whitespace is removed.
    """
    for unwanted in "'\n\r\xc2\xa0":
        str = str.replace(unwanted, " ")
    # Join the *words*: the original joined the characters of the string
    # itself, turning "abc" into "a b c".
    return " ".join(str.split())
508
def splitIdentifier(str):
    """Split an identifier into its lower-cased component words.

    A word starts at any ASCII letter and extends over the following
    upper-case letters, then the following lower-case letters; digits
    right after a word are dropped, and any other character is a
    separator.  For example "xmlParseFile" gives
    ["xml", "parse", "file"].
    """
    ret = []
    pos = 0
    end = len(str)
    # Walk with an index rather than re-slicing the string for every
    # character consumed.
    while pos < end:
        cur = str[pos].lower()
        pos = pos + 1
        if cur < 'a' or cur > 'z':
            continue
        start = pos
        while pos < end and 'A' <= str[pos] <= 'Z':
            pos = pos + 1
        cur = cur + str[start:pos].lower()
        start = pos
        while pos < end and 'a' <= str[pos] <= 'z':
            pos = pos + 1
        cur = cur + str[start:pos]
        while pos < end and '0' <= str[pos] <= '9':
            pos = pos + 1
        ret.append(cur)
    return ret
526
def addWord(word, module, symbol, relevance):
    """Add `relevance` to the score of `word` for (module, symbol) in the
    global wordsDict.

    Returns the accumulated relevance; -1 when word is None or shorter
    than 3 characters or module/symbol is None; 0 when the word is
    ignored (listed in dropWords, first character code above 0x80, or
    already disabled).  A word that has collected more than 500
    references is disabled by setting its entry to None.
    """
    if word is None or len(word) < 3:
        return -1
    if module is None or symbol is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    if word in wordsDict:
        refs = wordsDict[word]
        if refs is None:
            return 0
        if len(refs) > 500:
            wordsDict[word] = None
            return 0
        relevance = relevance + refs.get((module, symbol), 0)
    else:
        refs = wordsDict[word] = {}
    refs[(module, symbol)] = relevance
    return relevance
554
def addString(str, module, symbol, relevance):
    """Index every word of `str` (longer than 2 characters) for
    (module, symbol) with the given relevance.

    Returns the sum of the addWord() results, or -1 when str is None or
    shorter than 3 characters.
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) > 2:
            # Pass the caller's relevance on; the original ignored it and
            # always used 5, although callers ask for 7 on return values.
            ret = ret + addWord(word, module, symbol, relevance)

    return ret
566
def addWordHTML(word, resource, id, section, relevance):
    """Add `relevance` to the score of `word` for web page `resource` in
    the global wordsDictHTML.

    The entry stored per resource is (relevance, id, section); an id or
    section already recorded for that resource is kept in preference to
    the new one.

    Returns the accumulated relevance; -1 when word is None or shorter
    than 3 characters or resource/section is None; 0 when the word is
    ignored (in dropWords, first character code above 0x80, or disabled).
    """
    if word is None or len(word) < 3:
        return -1
    if resource is None or section is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    section = cleanupDescrString(section)

    if word in wordsDictHTML:
        refs = wordsDictHTML[word]
        if refs is None:
            print("skipped %s" % (word))
            return 0
        if resource in refs:
            (oldRelevance, oldId, oldSection) = refs[resource]
            if oldId is not None:
                id = oldId
            if oldSection is not None:
                section = oldSection
            relevance = relevance + oldRelevance
    else:
        refs = wordsDictHTML[word] = {}
    refs[resource] = (relevance, id, section)
    return relevance
600
601	def addStringHTML(str, resource, id, section, relevance):
602	if str == None or len(str) < 3:
603	return -1
604	ret = 0
605	str = cleanupWordsString(str)
606	l = string.split(str)
607	for word in l:
608	if len(word) > 2:
609	try:
610	r = addWordHTML(word, resource, id, section, relevance)
611	if r < 0:
612	print "addWordHTML failed: %s %s" % (word, resource)
613	ret = ret + r
614	except:
615	print "addWordHTML failed: %s %s %d" % (word, resource, relevance)
616	print sys.exc_type, sys.exc_value
617
618	return ret
619
def addWordArchive(word, id, relevance):
    """Add `relevance` to the score of `word` for archive message `id` in
    the global wordsDictArchive.

    Returns the accumulated relevance; -1 when word is None or shorter
    than 3 characters or id is None or -1; 0 when the word is ignored
    (in dropWords, first character code above 0x80, or disabled).
    """
    if word is None or len(word) < 3:
        return -1
    if id is None or id == -1:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    if word in wordsDictArchive:
        refs = wordsDictArchive[word]
        if refs is None:
            print("skipped %s" % (word))
            return 0
        relevance = relevance + refs.get(id, 0)
    else:
        refs = wordsDictArchive[word] = {}
    refs[id] = relevance
    return relevance
647
648	def addStringArchive(str, id, relevance):
649	if str == None or len(str) < 3:
650	return -1
651	ret = 0
652	str = cleanupWordsString(str)
653	l = string.split(str)
654	for word in l:
655	i = len(word)
656	if i > 2:
657	try:
658	r = addWordArchive(word, id, relevance)
659	if r < 0:
660	print "addWordArchive failed: %s %s" % (word, id)
661	else:
662	ret = ret + r
663	except:
664	print "addWordArchive failed: %s %s %d" % (word, id, relevance)
665	print sys.exc_type, sys.exc_value
666	return ret
667
668	#########################################################################
669	# #
670	# XML API description analysis #
671	# #
672	#########################################################################
673
def loadAPI(filename):
    """Parse the XML file `filename` with libxml2 and return the document.

    The caller is responsible for releasing it with freeDoc().
    """
    doc = libxml2.parseFile(filename)
    print("loaded %s" % (filename))
    return doc
678
def foundExport(file, symbol):
    """Register `symbol` as a function exported by `file` and index the
    words of its name with relevance 10.

    Returns 1, or 0 when file or symbol is None.
    """
    if file is None or symbol is None:
        return 0
    addFunction(symbol, file)
    for part in splitIdentifier(symbol):
        addWord(part, file, symbol, 10)
    return 1
689
def analyzeAPIFile(top):
    """Process one <file> element of the API description.

    Every <exports> child is registered through foundExport(); any other
    non-text child is reported.  Returns the number of exports handled.
    """
    count = 0
    name = top.prop("name")
    cur = top.children
    while cur is not None:
        if cur.type != 'text':
            if cur.name == "exports":
                count = count + foundExport(name, cur.prop("symbol"))
            else:
                # Two placeholders need two values: the offending element
                # and the file name (the original passed only one and
                # raised TypeError).
                print("unexpected element %s in API doc <file name='%s'>" % (cur.name, name))
        cur = cur.next
    return count
704
def analyzeAPIFiles(top):
    """Process the <files> element of the API description.

    Each <file> child goes through analyzeAPIFile(); other non-text
    children are reported.  Returns the total number of exports handled.
    """
    count = 0
    cur = top.children
    while cur is not None:
        if cur.type != 'text':
            if cur.name == "file":
                count = count + analyzeAPIFile(cur)
            else:
                print("unexpected element %s in API doc <files>" % (cur.name))
        cur = cur.next
    return count
719
def analyzeAPIEnum(top):
    """Register the <enum> element `top` and index the words of its name.

    Returns 1, or 0 when the "file" or "name" attribute is missing.
    """
    file = top.prop("file")
    symbol = None
    if file is not None:
        symbol = top.prop("name")
    if symbol is None:
        return 0

    addEnum(symbol, file)
    for part in splitIdentifier(symbol):
        addWord(part, file, symbol, 10)

    return 1
734
def analyzeAPIConst(top):
    """Register the <const> element `top` and index the words of its name.

    Returns 1, or 0 when the "file" or "name" attribute is missing.
    """
    file = top.prop("file")
    symbol = None
    if file is not None:
        symbol = top.prop("name")
    if symbol is None:
        return 0

    addConst(symbol, file)
    for part in splitIdentifier(symbol):
        addWord(part, file, symbol, 10)

    return 1
749
def analyzeAPIType(top):
    """Register the <typedef> element `top` and index the words of its name.

    Returns 1, or 0 when the "file" or "name" attribute is missing.
    """
    file = top.prop("file")
    symbol = None
    if file is not None:
        symbol = top.prop("name")
    if symbol is None:
        return 0

    addType(symbol, file)
    for part in splitIdentifier(symbol):
        addWord(part, file, symbol, 10)
    return 1
763
def analyzeAPIFunctype(top):
    """Register the <functype> element `top` and index the words of its name.

    Returns 1, or 0 when the "file" or "name" attribute is missing.
    """
    file = top.prop("file")
    symbol = None
    if file is not None:
        symbol = top.prop("name")
    if symbol is None:
        return 0

    addFunctype(symbol, file)
    for part in splitIdentifier(symbol):
        addWord(part, file, symbol, 10)
    return 1
777
def analyzeAPIStruct(top):
    """Register the <struct> element `top`, index the words of its name
    (relevance 10) and the words of its "info" attribute (relevance 5).

    Returns 1, or 0 when the "file" or "name" attribute is missing.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addStruct(symbol, file)
    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)

    info = top.prop("info")
    if info is not None:
        for word in info.replace("'", " ").split():
            if len(word) > 2:
                addWord(word, file, symbol, 5)
    return 1
800
def analyzeAPIMacro(top):
    """Register the <macro> element `top`.

    The words of the macro name are indexed with relevance 10 and those
    of its first <info> child with relevance 5; the <info> text is also
    stored as the symbol description.

    Returns 1; 0 when the "file" or "name" attribute is missing, or when
    there is no <info> child (the macro is still registered then).
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    symbol = symbol.replace("'", " ").strip()

    # Content of the first non-text child named <info>, if any.
    info = None
    cur = top.children
    while cur is not None:
        if cur.type != 'text' and cur.name == "info":
            info = cur.content
            break
        cur = cur.next

    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)

    if info is None:
        addMacro(symbol, file)
        print("Macro %s description has no <info>" % (symbol))
        return 0

    info = info.replace("'", " ").strip()
    addMacro(symbol, file, info)
    for word in info.split():
        if len(word) > 2:
            addWord(word, file, symbol, 5)
    return 1
839
def analyzeAPIFunction(top):
    """Register the <function> element `top` and index its documentation.

    Words come from the "info" attribute of <return> (relevance 7), the
    "info" attribute of each <arg> (5), each <arg> name (7), the <info>
    child (5, also stored as the description) and the function name (10).

    Returns 1, or 0 when the "file" or "name" attribute is missing.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    symbol = symbol.replace("'", " ").strip()

    info = None
    cur = top.children
    while cur is not None:
        if cur.type == 'text':
            cur = cur.next
            continue
        if cur.name == "info":
            info = cur.content
        elif cur.name == "return":
            rinfo = cur.prop("info")
            if rinfo is not None:
                rinfo = rinfo.replace("'", " ").strip()
                addString(rinfo, file, symbol, 7)
        elif cur.name == "arg":
            ainfo = cur.prop("info")
            if ainfo is not None:
                ainfo = ainfo.replace("'", " ").strip()
                addString(ainfo, file, symbol, 5)
            name = cur.prop("name")
            if name is not None:
                name = name.replace("'", " ").strip()
                addWord(name, file, symbol, 7)
        cur = cur.next

    if info is None:
        print("Function %s description has no <info>" % (symbol))
        addFunction(symbol, file, "")
    else:
        info = info.replace("'", " ").strip()
        addFunction(symbol, file, info)
        addString(info, file, symbol, 5)

    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)

    return 1
890
def analyzeAPISymbols(top):
    """Process the <symbols> element of the API description.

    Each non-text child is handed to the analyzer matching its element
    name; unknown elements are reported.  Returns the sum of the
    analyzers' results.
    """
    handlers = {
        "macro": analyzeAPIMacro,
        "function": analyzeAPIFunction,
        "const": analyzeAPIConst,
        "typedef": analyzeAPIType,
        "struct": analyzeAPIStruct,
        "enum": analyzeAPIEnum,
        "functype": analyzeAPIFunctype,
    }
    count = 0
    cur = top.children
    while cur is not None:
        if cur.type != 'text':
            handler = handlers.get(cur.name)
            if handler is not None:
                count = count + handler(cur)
            else:
                # The original message said <files> here (copy/paste slip).
                print("unexpected element %s in API doc <symbols>" % (cur.name))
        cur = cur.next
    return count
917
def analyzeAPI(doc):
    """Walk the parsed API description and index its <symbols> section.

    Returns the number of symbols analyzed, or -1 when doc is None or its
    root element is not <api>.
    """
    if doc is None:
        return -1
    root = doc.getRootElement()
    if root.name != "api":
        print("Unexpected root name")
        return -1

    count = 0
    cur = root.children
    while cur is not None:
        if cur.type != 'text':
            if cur.name == "files":
                # Deliberately skipped: the call to analyzeAPIFiles() is
                # disabled in the original source.
                pass
            elif cur.name == "symbols":
                count = count + analyzeAPISymbols(cur)
            else:
                print("unexpected element %s in API doc" % (cur.name))
        cur = cur.next
    return count
940
941	#########################################################################
942	# #
943	# Web pages parsing and analysis #
944	# #
945	#########################################################################
946
947	import glob
948
def analyzeHTMLText(doc, resource, p, section, id):
    """Index the content of text node `p` for page `resource`.

    Returns the addStringHTML() result (relevance 5), or -1 if the
    content cannot be read or indexed.  `doc` is unused.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        return -1
957
def analyzeHTMLPara(doc, resource, p, section, id):
    """Index the content of paragraph node `p` for page `resource`.

    Returns the addStringHTML() result (relevance 5), or -1 if the
    content cannot be read or indexed.  `doc` is unused.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        return -1
966
def analyzeHTMLPre(doc, resource, p, section, id):
    """Index the content of <pre> node `p` for page `resource`.

    Returns the addStringHTML() result (relevance 5), or -1 if the
    content cannot be read or indexed.  `doc` is unused.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        return -1
975
# NOTE(review): this five-argument analyzeHTML() is dead code.  The
# two-argument analyzeHTML(doc, resource) defined right after it rebinds
# the same name, so this version can never be called.
def analyzeHTML(doc, resource, p, section, id):
    """Index the content of node `p` for page `resource`.

    Returns the addStringHTML() result (relevance 5), or -1 if the
    content cannot be read or indexed.  `doc` is unused.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        return -1
984
def analyzeHTML(doc, resource):
    """Index the text of the parsed HTML document `doc` as page `resource`.

    The page title (or "Page <resource>" when there is none) is recorded
    with addPage().  Headings h1/h2/h3 set the current section and
    anchor; every text node is indexed under the current section.

    Returns the number of text/paragraph nodes processed.
    """
    para = 0
    ctxt = doc.xpathNewContext()
    try:
        try:
            title = ctxt.xpathEval("//head/title")[0].content
        except Exception:
            title = "Page %s" % (resource)
        addPage(resource, title)
        try:
            # "|" is the XPath union operator; the escaped "\|" found in
            # the original string is not valid XPath.
            items = ctxt.xpathEval("//h1 | //h2 | //h3 | //text()")
            section = title
            anchor = ""
            for item in items:
                if item.name in ('h1', 'h2', 'h3'):
                    section = item.content
                    if item.prop("id"):
                        anchor = item.prop("id")
                    elif item.prop("name"):
                        anchor = item.prop("name")
                elif item.type == 'text':
                    analyzeHTMLText(doc, resource, item, section, anchor)
                    para = para + 1
                elif item.name == 'p':
                    analyzeHTMLPara(doc, resource, item, section, anchor)
                    para = para + 1
                elif item.name == 'pre':
                    analyzeHTMLPre(doc, resource, item, section, anchor)
                    para = para + 1
                else:
                    print("Page %s, unexpected %s element" % (resource, item.name))
        except Exception:
            print("Page %s: problem analyzing" % (resource))
            print("%s %s" % sys.exc_info()[:2])
    finally:
        # XPath contexts of the libxml2 bindings must be freed explicitly.
        ctxt.xpathFreeContext()

    return para
1021
def analyzeHTMLPages():
    """Index every HTML page of the current directory and of tutorial/.

    Files whose path starts with "API" and the file xml.html are skipped.
    Each page is parsed as XML first and with the HTML parser as a
    fallback.  Returns the number of pages successfully analyzed.
    """
    ret = 0
    # The "*" wildcards are required: a bare ".html" pattern only matches
    # a file literally named ".html".
    HTMLfiles = glob.glob("*.html") + glob.glob("tutorial/*.html")
    for html in HTMLfiles:
        if html[0:3] == "API" or html == "xml.html":
            continue
        doc = None
        try:
            try:
                doc = libxml2.parseFile(html)
            except Exception:
                doc = libxml2.htmlParseFile(html, None)
            res = analyzeHTML(doc, html)
            print("Parsed %s : %d paragraphs" % (html, res))
            ret = ret + 1
        except Exception:
            # Also covers a failure of the HTML fallback parser, which
            # used to abort the whole loop.
            print("could not parse %s" % (html))
        if doc is not None:
            # Documents are not released automatically by the bindings.
            doc.freeDoc()
    return ret
1041
1042	#########################################################################
1043	# #
1044	# Mail archives parsing and analysis #
1045	# #
1046	#########################################################################
1047
1048	import time
1049
def getXMLDateArchive(t = None):
    """Return the URL of the by-date index of the xml mailing-list archive
    for the month containing time `t`.

    `t` is in seconds since the epoch and is interpreted in UTC; None
    means now.
    """
    when = time.gmtime(time.time() if t is None else t)
    return "http://mail.gnome.org/archives/xml/%d-%s/date.html" % (
        when[0], time.strftime("%B", when))
1058
def scanXMLMsgArchive(url, title, force = 0):
    """Fetch one archived message and index its title and <pre> text.

    A message already present in the archives table is skipped unless
    `force` is non-zero.  The title is indexed with relevance 20, the
    text found under <pre> elements with relevance 5.

    Returns 1 when the message was indexed, 0 otherwise.
    """
    if url is None or title is None:
        return 0

    ID = checkXMLMsgArchive(url)
    if force == 0 and ID != -1:
        return 0

    if ID == -1:
        ID = addXMLMsgArchive(url, title)
        if ID == -1:
            return 0

    try:
        print("Loading %s" % (url))
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return 0

    try:
        addStringArchive(title, ID, 20)
        ctxt = doc.xpathNewContext()
        try:
            for text in ctxt.xpathEval("//pre//text()"):
                addStringArchive(text.content, ID, 5)
        finally:
            ctxt.xpathFreeContext()
    finally:
        # Release the parsed page; the bindings do not do it themselves.
        doc.freeDoc()

    return 1
1088
def scanXMLDateArchive(t = None, force = 0):
    """Index the messages listed in the monthly archive index for time `t`.

    The global wordsDictArchive is reset first.  Every link whose href
    starts with "msg" is resolved against the index URL and passed to
    scanXMLMsgArchive(), after removing a leading "Re: " and "[xml] "
    from its title.

    Returns the number of messages newly indexed, or -1 when the index
    page cannot be loaded.
    """
    global wordsDictArchive

    wordsDictArchive = {}

    url = getXMLDateArchive(t)
    print("loading %s" % (url))
    try:
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return -1

    newmsg = 0
    try:
        ctxt = doc.xpathNewContext()
        try:
            for anchor in ctxt.xpathEval("//a[@href]"):
                href = anchor.prop("href")
                if href is None or href[0:3] != "msg":
                    continue
                try:
                    msg = libxml2.buildURI(href, url)
                    title = anchor.content
                    if title is not None and title[0:4] == 'Re: ':
                        title = title[4:]
                    if title is not None and title[0:6] == '[xml] ':
                        title = title[6:]
                    newmsg = newmsg + scanXMLMsgArchive(msg, title, force)
                except Exception:
                    # One bad message must not stop the scan, but say so
                    # instead of hiding it.
                    print("Failed to index message %s: %s" %
                          (href, sys.exc_info()[1]))
        finally:
            ctxt.xpathFreeContext()
    finally:
        doc.freeDoc()

    return newmsg
1126
1127
1128	#########################################################################
1129	# #
1130	# Main code: open the DB, the API XML and analyze it #
1131	# #
1132	#########################################################################
def analyzeArchives(t = None, force = 0):
    """Index the mail archive of the month containing `t` and store the
    resulting word/message associations in the database.
    """
    ret = scanXMLDateArchive(t, force)
    print("Indexed %d words in %d archive pages" % (len(wordsDictArchive), ret))

    i = 0
    for word in wordsDictArchive.keys():
        refs = wordsDictArchive[word]
        if refs is None:
            # word was disabled, nothing to store
            continue
        for id in refs.keys():
            updateWordArchive(word, id, refs[id])
            i = i + 1

    # The original message said "HTML pages" here (copy/paste slip).
    print("Found %d associations in archive pages" % (i))
1152
def analyzeHTMLTop():
    """Index the HTML documentation pages and store the resulting
    word/page associations in the database.
    """
    ret = analyzeHTMLPages()
    print("Indexed %d words in %d HTML pages" % (len(wordsDictHTML), ret))

    i = 0
    for word in wordsDictHTML.keys():
        refs = wordsDictHTML[word]
        if refs is None:
            # word was disabled, nothing to store
            continue
        for resource in refs.keys():
            (relevance, id, section) = refs[resource]
            updateWordHTML(word, resource, section, id, relevance)
            i = i + 1

    print("Found %d associations in HTML pages" % (i))
1172
def analyzeAPITop():
    """Parse the API description named by API, index it and store the
    word/symbol associations in the database.

    Exits the process with status 1 if the description cannot be parsed
    or analyzed.
    """
    try:
        doc = loadAPI(API)
        ret = analyzeAPI(doc)
        print("Analyzed %d blocs" % (ret))
        doc.freeDoc()
    except Exception:
        print("Failed to parse and analyze %s" % (API))
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    print("Indexed %d words" % (len(wordsDict)))
    i = 0
    skipped = 0
    for word in wordsDict.keys():
        refs = wordsDict[word]
        if refs is None:
            # word disabled by addWord() for having too many references
            skipped = skipped + 1
            continue
        for (module, symbol) in refs.keys():
            updateWord(word, symbol, refs[(module, symbol)])
            i = i + 1

    print("Found %d associations, skipped %d words" % (i, skipped))
1200
def usage():
    """Print the command-line synopsis and exit with status 1."""
    print("Usage index.py [--force] [--archive] [--archive-year year] [--archive-month month] [--API] [--docs]")
    sys.exit(1)
1204
def _indexMonthArchive(spec, force):
    """Index the archive of the month given as "<year>-<month name>"."""
    try:
        T = time.strptime(spec, "%Y-%B")
        # Aim ten days into the month so the timestamp stays inside it
        # once converted back with gmtime().
        t = time.mktime(T) + 3600 * 24 * 10
        analyzeArchives(t, force)
    except Exception:
        print("Failed to index month archive:")
        print("%s %s" % sys.exc_info()[:2])

def main():
    """Open the database, then run the actions named on the command line
    in order.  Without arguments, or with an unknown or incomplete
    option, the usage is printed and the process exits with status 1.
    """
    try:
        openMySQL()
    except Exception:
        print("Failed to open the database")
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    args = sys.argv[1:]
    if not args:
        usage()

    months = ["January" , "February", "March", "April", "May",
              "June", "July", "August", "September", "October",
              "November", "December"]
    force = 0
    i = 0
    while i < len(args):
        arg = args[i]
        if arg == '--force':
            force = 1
        elif arg == '--archive':
            analyzeArchives(None, force)
        elif arg == '--archive-year' or arg == '--archive-month':
            i = i + 1
            if i >= len(args):
                # the option needs a value; the original crashed with
                # IndexError here
                usage()
            if arg == '--archive-year':
                for month in months:
                    _indexMonthArchive("%s-%s" % (args[i], month), force)
            else:
                _indexMonthArchive(args[i], force)
        elif arg == '--API':
            analyzeAPITop()
        elif arg == '--docs':
            analyzeHTMLTop()
        else:
            usage()
        i = i + 1

if __name__ == "__main__":
    main()

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/libxml2-2.9.4/doc/index.py@ 78404

Download in other formats: