VirtualBox

source: vbox/trunk/src/libs/libxml2-2.9.4/doc/index.py@ 82520

Last change on this file since 82520 was 65950, checked in by vboxsync, 8 years ago

libxml 2.9.4: fix export

  • Property svn:eol-style set to LF
  • Property svn:executable set to *
File size: 32.1 KB
Line 
1#!/usr/bin/python -u
2#
3# imports the API description and fills up a database with
4# name relevance to modules, functions or web pages
5#
6# Operation needed:
7# =================
8#
9# install mysqld, the python wrappers for mysql and libxml2, start mysqld
10# Change the root passwd of mysql:
11# mysqladmin -u root password new_password
12# Create the new database xmlsoft
13# mysqladmin -p create xmlsoft
14# Create a database user 'veillard' and give him password access
15# change veillard and abcde with the right user name and passwd
16# mysql -p
17# password:
18# mysql> GRANT ALL PRIVILEGES ON xmlsoft TO veillard@localhost
19# IDENTIFIED BY 'abcde' WITH GRANT OPTION;
20#
21# As the user check the access:
22# mysql -p xmlsoft
23# Enter password:
24# Welcome to the MySQL monitor....
25# mysql> use xmlsoft
26# Database changed
27# mysql> quit
28# Bye
29#
30# Then run the script in the doc subdir, it will create the symbols and
31# word tables and populate them with information extracted from
32# the libxml2-api.xml API description, and make them accessible read-only
33# by nobody@localhost, the user Apache is expected to run as
34#
35# On the Apache configuration, make sure you have php support enabled
36#
37
38import MySQLdb
39import libxml2
40import sys
41import string
42import os
43
44#
45# We are not interested in parsing errors here
46#
def callback(ctx, str):
    """No-op error handler: every reported parsing error is discarded."""
    return None
49libxml2.registerErrorHandler(callback, None)
50
51#
52# The dictionary of tables required and the SQL command needed
53# to create them
54#
TABLES = {
    # one row per documented API symbol
    "symbols": """CREATE TABLE symbols (
           name varchar(255) BINARY NOT NULL,
           module varchar(255) BINARY NOT NULL,
           type varchar(25) NOT NULL,
           descr varchar(255),
           UNIQUE KEY name (name),
           KEY module (module))""",
    # word -> API symbol associations
    "words": """CREATE TABLE words (
           name varchar(50) BINARY NOT NULL,
           symbol varchar(255) BINARY NOT NULL,
           relevance int,
           KEY name (name),
           KEY symbol (symbol),
           UNIQUE KEY ID (name, symbol))""",
    # word -> HTML page associations
    "wordsHTML": """CREATE TABLE wordsHTML (
           name varchar(50) BINARY NOT NULL,
           resource varchar(255) BINARY NOT NULL,
           section varchar(255),
           id varchar(50),
           relevance int,
           KEY name (name),
           KEY resource (resource),
           UNIQUE KEY ref (name, resource))""",
    # word -> archived message associations
    "wordsArchive": """CREATE TABLE wordsArchive (
           name varchar(50) BINARY NOT NULL,
           ID int(11) NOT NULL,
           relevance int,
           KEY name (name),
           UNIQUE KEY ref (name, ID))""",
    # indexed HTML pages and their titles
    "pages": """CREATE TABLE pages (
           resource varchar(255) BINARY NOT NULL,
           title varchar(255) BINARY NOT NULL,
           UNIQUE KEY name (resource))""",
    # indexed mail archive messages
    "archives": """CREATE TABLE archives (
           ID int(11) NOT NULL auto_increment,
           resource varchar(255) BINARY NOT NULL,
           title varchar(255) BINARY NOT NULL,
           UNIQUE KEY id (ID,resource(255)),
           INDEX (ID),
           INDEX (resource))""",
    "Queries": """CREATE TABLE Queries (
           ID int(11) NOT NULL auto_increment,
           Value varchar(50) NOT NULL,
           Count int(11) NOT NULL,
           UNIQUE KEY id (ID,Value(35)),
           INDEX (ID))""",
    "AllQueries": """CREATE TABLE AllQueries (
           ID int(11) NOT NULL auto_increment,
           Value varchar(50) NOT NULL,
           Count int(11) NOT NULL,
           UNIQUE KEY id (ID,Value(35)),
           INDEX (ID))""",
}
109
110#
111# The XML API description file to parse
112#
API = "libxml2-api.xml"
# Shared database connection; stays None until openMySQL() assigns it.
DB = None
115
116#########################################################################
117# #
118# MySQL database interfaces #
119# #
120#########################################################################
def createTable(db, name):
    """Drop (if present) and re-create the table `name` described in TABLES.

    Returns -1 when `db` or `name` is None or when the CREATE statement
    fails; otherwise returns what the cursor's execute() gave back for it.
    """
    global TABLES

    if db is None or name is None:
        return -1

    cursor = db.cursor()
    dropped = cursor.execute("DROP TABLE IF EXISTS %s" % (name))
    if dropped == 1:
        print("Removed table %s" % (name))
    print("Creating table %s" % (name))
    try:
        return cursor.execute(TABLES[name])
    except:
        print("Failed to create table %s" % (name))
        return -1
140
def checkTables(db, verbose = 1):
    """Make sure every table listed in TABLES exists and can be queried.

    Missing tables are created through createTable(); a table whose row
    count cannot be read is sent through "repair table" and counted again.
    Finally SELECT (and INSERT/UPDATE on Queries) is granted to
    nobody@localhost on a best-effort basis.

    Returns -1 when `db` is None, 0 otherwise.
    """
    global TABLES

    if db is None:
        return -1
    cursor = db.cursor()
    remaining = cursor.execute("show tables")
    if verbose:
        print("Found %d tables" % (remaining))

    # collect the names of the tables that already exist
    existing = {}
    while remaining > 0:
        existing[cursor.fetchone()[0]] = {}
        remaining = remaining - 1

    for table in TABLES.keys():
        if table not in existing:
            print("table %s missing" % (table))
            createTable(db, table)
        try:
            ret = cursor.execute("SELECT count(*) from %s" % table)
            row = cursor.fetchone()
            if verbose:
                print("Table %s contains %d records" % (table, row[0]))
        except:
            print("Troubles with table %s : repairing" % (table))
            ret = cursor.execute("repair table %s" % table)
            print("repairing returned %d" % (ret))
            ret = cursor.execute("SELECT count(*) from %s" % table)
            row = cursor.fetchone()
            print("Table %s contains %d records" % (table, row[0]))
    if verbose:
        print("checkTables finished")

    # make sure apache can access the tables read-only
    try:
        ret = cursor.execute("GRANT SELECT ON xmlsoft.* TO nobody@localhost")
        ret = cursor.execute("GRANT INSERT,SELECT,UPDATE ON xmlsoft.Queries TO nobody@localhost")
    except:
        pass
    return 0
184
def openMySQL(db="xmlsoft", passwd=None, verbose = 1):
    """Connect to the MySQL database and store the connection in DB.

    When `passwd` is None the password is read from the MySQL_PASS
    environment variable; if that is unavailable a message is printed and
    the process exits with status 1.

    Returns -1 if the connection object is None, otherwise the result of
    checkTables() on the new connection.
    """
    global DB

    if passwd is None:
        try:
            passwd = os.environ["MySQL_PASS"]
        except:
            print("No password available, set environment MySQL_PASS")
            sys.exit(1)

    DB = MySQLdb.connect(passwd=passwd, db=db)
    if DB is None:
        return -1
    return checkTables(DB, verbose)
200
def updateWord(name, symbol, relevance):
    """Store the relevance of word `name` for API symbol `symbol`.

    An INSERT into the words table is tried first; if it fails (for
    instance because the (name, symbol) pair already exists) the existing
    row is UPDATEd instead.  The connection is opened on demand.

    Values are passed to execute() as parameters rather than being
    interpolated into the SQL text, so quoting is left to the driver.

    Returns -1 if no connection is available, if `name` or `symbol` is
    None, or if both statements fail; otherwise the execute() result.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None or name is None or symbol is None:
        return -1

    cursor = DB.cursor()
    try:
        ret = cursor.execute(
            "INSERT INTO words (name, symbol, relevance) VALUES (%s, %s, %s)",
            (name, symbol, relevance))
    except Exception:
        # most likely a duplicate key: refresh the existing row
        try:
            ret = cursor.execute(
                "UPDATE words SET relevance = %s where name = %s and symbol = %s",
                (relevance, name, symbol))
        except Exception:
            print("Update word (%s, %s, %s) failed command" % (name, symbol, relevance))
            print("UPDATE words SET relevance = %s where name = '%s' and symbol = '%s'" % (relevance, name, symbol))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
230
def updateSymbol(name, module, type, desc):
    """Insert or refresh the row of the symbols table describing `name`.

    The symbol's own name is first indexed as a word with relevance 50.
    `desc` is reduced to its first sentence (text before the first "."),
    with single quotes turned into spaces, truncated to 99 characters; any
    failure while doing so yields an empty description.

    Values are passed to execute() as parameters rather than being
    interpolated into the SQL text.

    Returns -1 if no connection is available, if `name`, `module` or
    `type` is None, or if both INSERT and UPDATE fail; otherwise the
    execute() result.
    """
    global DB

    updateWord(name, name, 50)
    if DB is None:
        openMySQL()
    if DB is None or name is None or module is None or type is None:
        return -1

    try:
        desc = desc.replace("'", " ").split(".")[0][0:99]
    except Exception:
        # desc was not a string (e.g. None)
        desc = ""

    cursor = DB.cursor()
    try:
        ret = cursor.execute(
            "INSERT INTO symbols (name, module, type, descr) VALUES (%s, %s, %s, %s)",
            (name, module, type, desc))
    except Exception:
        # most likely a duplicate key: refresh the existing row
        try:
            ret = cursor.execute(
                "UPDATE symbols SET module=%s, type=%s, descr=%s where name=%s",
                (module, type, desc, name))
        except Exception:
            print("Update symbol (%s, %s, %s) failed command" % (name, module, type))
            print("""UPDATE symbols SET module='%s', type='%s', descr='%s' where name='%s'""" % (module, type, desc, name))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
271
def addFunction(name, module, desc = ""):
    """Register `name` of `module` as a symbol of type 'function'."""
    kind = 'function'
    return updateSymbol(name, module, kind, desc)
274
def addMacro(name, module, desc = ""):
    """Register `name` of `module` as a symbol of type 'macro'."""
    kind = 'macro'
    return updateSymbol(name, module, kind, desc)
277
def addEnum(name, module, desc = ""):
    """Register `name` of `module` as a symbol of type 'enum'."""
    kind = 'enum'
    return updateSymbol(name, module, kind, desc)
280
def addStruct(name, module, desc = ""):
    """Register `name` of `module` as a symbol of type 'struct'."""
    kind = 'struct'
    return updateSymbol(name, module, kind, desc)
283
def addConst(name, module, desc = ""):
    """Register `name` of `module` as a symbol of type 'const'."""
    kind = 'const'
    return updateSymbol(name, module, kind, desc)
286
def addType(name, module, desc = ""):
    """Register `name` of `module` as a symbol of type 'type'."""
    kind = 'type'
    return updateSymbol(name, module, kind, desc)
289
def addFunctype(name, module, desc = ""):
    """Register `name` of `module` as a symbol of type 'functype'."""
    kind = 'functype'
    return updateSymbol(name, module, kind, desc)
292
def addPage(resource, title):
    """Record the HTML page `resource` and its `title` in the pages table.

    An INSERT is tried first; if it fails the title of the existing row is
    UPDATEd.  Values are passed to execute() as parameters rather than
    being interpolated into the SQL text.

    Returns -1 if no connection is available, if `resource` is None, or if
    both statements fail; otherwise the execute() result.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None or resource is None:
        return -1

    cursor = DB.cursor()
    try:
        ret = cursor.execute(
            "INSERT INTO pages (resource, title) VALUES (%s, %s)",
            (resource, title))
    except Exception:
        # most likely a duplicate key: refresh the existing row
        try:
            ret = cursor.execute(
                "UPDATE pages SET title=%s WHERE resource=%s",
                (title, resource))
        except Exception:
            # The previous message formatted name/module/type, which do not
            # exist in this function, so the error path itself crashed.
            print("Update page (%s, %s) failed command" % (resource, title))
            print("""UPDATE pages SET title='%s' WHERE resource='%s'""" % (title, resource))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
320
def updateWordHTML(name, resource, desc, id, relevance):
    """Store the relevance of word `name` for the HTML page `resource`.

    `desc` is stored in the section column after single quotes are turned
    into spaces and it is truncated to 99 characters (None, or a value that
    cannot be processed, becomes ""); a None `id` becomes "".

    An INSERT into wordsHTML is tried first and an UPDATE of the existing
    row is the fallback.  Values are passed to execute() as parameters
    rather than being interpolated into the SQL text.

    Returns -1 if no connection is available, if `name` or `resource` is
    None, or if both statements fail; otherwise the execute() result.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None or name is None or resource is None:
        return -1
    if id is None:
        id = ""
    if desc is None:
        desc = ""
    else:
        try:
            desc = desc.replace("'", " ")[0:99]
        except Exception:
            desc = ""

    cursor = DB.cursor()
    try:
        ret = cursor.execute(
            "INSERT INTO wordsHTML (name, resource, section, id, relevance) VALUES (%s, %s, %s, %s, %s)",
            (name, resource, desc, id, relevance))
    except Exception:
        # most likely a duplicate key: refresh the existing row
        try:
            ret = cursor.execute(
                "UPDATE wordsHTML SET section=%s, id=%s, relevance=%s where name=%s and resource=%s",
                (desc, id, relevance, name, resource))
        except Exception:
            print("Update symbol (%s, %s, %s) failed command" % (name, resource, relevance))
            print("""UPDATE wordsHTML SET section='%s', id='%s', relevance='%s' where name='%s' and resource='%s'""" % (desc, id, relevance, name, resource))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
360
def checkXMLMsgArchive(url):
    """Look up the ID under which the archive message `url` is recorded.

    The URL is passed to execute() as a parameter rather than being
    interpolated into the SQL text.

    Returns the ID column of the matching archives row, or -1 if no
    connection is available, `url` is None, the query fails, or no row
    matches.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None or url is None:
        return -1

    cursor = DB.cursor()
    try:
        cursor.execute("SELECT ID FROM archives WHERE resource=%s", (url,))
        row = cursor.fetchone()
    except Exception:
        return -1
    if row is None:
        return -1

    return row[0]
382
def addXMLMsgArchive(url, title):
    """Record the archive message `url` and return its ID as an int.

    A None `title` is stored as ""; otherwise single quotes are turned into
    spaces and the title is truncated to 99 characters.  After the INSERT
    the ID is read back with a SELECT on the same URL.  Values are passed
    to execute() as parameters rather than being interpolated into the SQL
    text.

    Returns -1 if no connection is available, `url` is None, a statement
    fails, or the row cannot be read back.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None or url is None:
        return -1
    if title is None:
        title = ""
    else:
        title = title.replace("'", " ")[0:99]

    cursor = DB.cursor()
    # cmd always names the statement being run so the handler can report it
    cmd = "INSERT INTO archives (resource, title) VALUES (%s, %s)"
    try:
        cursor.execute(cmd, (url, title))
        cmd = "SELECT ID FROM archives WHERE resource=%s"
        cursor.execute(cmd, (url,))
        row = cursor.fetchone()
        if row is None:
            print("addXMLMsgArchive failed to get the ID: %s" % (url))
            return -1
    except Exception:
        print("addXMLMsgArchive failed command: %s" % (cmd))
        return -1

    return int(row[0])
413
def updateWordArchive(name, id, relevance):
    """Store the relevance of word `name` for the archive message `id`.

    An INSERT into wordsArchive is tried first and an UPDATE of the
    existing row is the fallback.  Values are passed to execute() as
    parameters rather than being interpolated into the SQL text.

    Returns -1 if no connection is available, if `name` or `id` is None,
    or if both statements fail; otherwise the execute() result.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None or name is None or id is None:
        return -1

    cursor = DB.cursor()
    try:
        ret = cursor.execute(
            "INSERT INTO wordsArchive (name, id, relevance) VALUES (%s, %s, %s)",
            (name, id, relevance))
    except Exception:
        # most likely a duplicate key: refresh the existing row
        try:
            ret = cursor.execute(
                "UPDATE wordsArchive SET relevance=%s where name=%s and ID=%s",
                (relevance, name, id))
        except Exception:
            print("Update word archive (%s, %s, %s) failed command" % (name, id, relevance))
            print("""UPDATE wordsArchive SET relevance='%s' where name='%s' and ID='%s'""" % (relevance, name, id))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
443
444#########################################################################
445# #
446# Word dictionary and analysis routines #
447# #
448#########################################################################
449
450#
451# top 100 English words, excluding those shorter than 3 characters, plus our own set
452#
# Words that are never indexed; only membership matters, every value is 0.
dropWords = dict.fromkeys("""
    the this can man had him only
    and not been other even are was
    new most but when some made from
    who could after that will time also
    have more these did two many
    they may before for which out then
    must one through with you said
    first back were what any years his
    her where all its now much she
    about such your there into like
    would than our well their them over
    down
    net www bad Okay bin cur
""".split(), 0)

# In-memory indexes filled by addWord(), addWordHTML() and addWordArchive().
wordsDict = {}
wordsDictHTML = {}
wordsDictArchive = {}
472
def cleanupWordsString(str):
    """Return `str` with punctuation and line breaks replaced by spaces.

    Each listed character is replaced one-for-one by a space, so the
    length of the string is unchanged.
    """
    separators = (
        ".", "!", "?", ",", "'", '"', ";", "(", ")", "{", "}", "<", ">",
        "=", "/", "*", ":", "#", "\\", "\n", "\r", "\xc2", "\xa0",
    )
    for sep in separators:
        str = str.replace(sep, " ")
    return str
498
def cleanupDescrString(str):
    """Return `str` as a single line with normalized whitespace.

    Single quotes, line breaks and the "\\xc2" / "\\xa0" characters are
    turned into spaces, then runs of whitespace are collapsed to one space
    and leading/trailing whitespace is dropped.
    """
    for ch in ("'", "\n", "\r", "\xc2", "\xa0"):
        str = str.replace(ch, " ")
    # Join the split words: joining the string itself (as was done before)
    # inserted a space between every single character.
    return " ".join(str.split())
508
def splitIdentifier(str):
    """Split an identifier such as xmlParseFile into lower-case words.

    A word starts at a letter, continues over any following upper-case
    letters and then any following lower-case letters; digits directly
    after a word are discarded.  Characters that cannot start a word are
    skipped.
    """
    words = []
    pos = 0
    size = len(str)
    while pos < size:
        word = str[pos].lower()
        pos += 1
        if not ('a' <= word <= 'z'):
            continue
        # rest of a leading upper-case run, folded to lower case
        while pos < size and 'A' <= str[pos] <= 'Z':
            word = word + str[pos].lower()
            pos += 1
        while pos < size and 'a' <= str[pos] <= 'z':
            word = word + str[pos]
            pos += 1
        # trailing digits are dropped, not kept as part of the word
        while pos < size and '0' <= str[pos] <= '9':
            pos += 1
        words.append(word)
    return words
526
def addWord(word, module, symbol, relevance):
    """Accumulate `relevance` for `word` against (module, symbol) in wordsDict.

    Returns -1 for a None or too short (< 3 characters) word or a None
    module/symbol, 0 when the word is ignored (listed in dropWords, first
    character code above 0x80, or the word was disabled), and otherwise
    the accumulated relevance now stored for the pair.

    A word whose entry already holds more than 500 references is disabled
    by setting its entry to None.
    """
    global wordsDict

    if word is None or len(word) < 3:
        return -1
    if module is None or symbol is None:
        return -1
    if word in dropWords or ord(word[0]) > 0x80:
        return 0

    key = (module, symbol)
    if word not in wordsDict:
        wordsDict[word] = {}
    else:
        refs = wordsDict[word]
        if refs is None:
            return 0
        if len(refs) > 500:
            wordsDict[word] = None
            return 0
        try:
            relevance = relevance + refs[key]
        except:
            pass
    wordsDict[word][key] = relevance
    return relevance
554
def addString(str, module, symbol, relevance):
    """Index every word of `str` against (module, symbol).

    The text is passed through cleanupWordsString() and split on
    whitespace; each word longer than 2 characters is handed to addWord()
    with the caller's `relevance` (previously a fixed 5 was used and the
    parameter was ignored).

    Returns -1 when `str` is None or shorter than 3 characters, otherwise
    the sum of the addWord() results.
    """
    if str is None or len(str) < 3:
        return -1
    total = 0
    for word in cleanupWordsString(str).split():
        if len(word) > 2:
            total = total + addWord(word, module, symbol, relevance)

    return total
566
def addWordHTML(word, resource, id, section, relevance):
    """Accumulate `relevance` for `word` against page `resource` in wordsDictHTML.

    Each entry is a (relevance, id, section) tuple.  When the page already
    has an entry for the word, its non-None id and section are kept and the
    relevances are added.

    Returns -1 for a None or too short (< 3 characters) word or a None
    resource/section, 0 when the word is ignored, and otherwise the
    accumulated relevance.
    """
    global wordsDictHTML

    if word is None or len(word) < 3:
        return -1
    if resource is None or section is None:
        return -1
    if word in dropWords or ord(word[0]) > 0x80:
        return 0

    section = cleanupDescrString(section)

    if word not in wordsDictHTML:
        wordsDictHTML[word] = {}
    else:
        pages = wordsDictHTML[word]
        if pages is None:
            print("skipped %s" % (word))
            return 0
        try:
            (old_relevance, old_id, old_section) = pages[resource]
            if old_id is not None:
                id = old_id
            if old_section is not None:
                section = old_section
            relevance = relevance + old_relevance
        except:
            pass
    wordsDictHTML[word][resource] = (relevance, id, section)
    return relevance
600
def addStringHTML(str, resource, id, section, relevance):
    """Index every word of `str` against the HTML page `resource`.

    Words longer than 2 characters are passed to addWordHTML(); negative
    results and exceptions are reported on stdout.  Negative results are
    still added to the total.

    Returns -1 when `str` is None or shorter than 3 characters, otherwise
    the sum of the addWordHTML() results.
    """
    if str is None or len(str) < 3:
        return -1
    total = 0
    for word in cleanupWordsString(str).split():
        if len(word) <= 2:
            continue
        try:
            got = addWordHTML(word, resource, id, section, relevance)
            if got < 0:
                print("addWordHTML failed: %s %s" % (word, resource))
            total = total + got
        except:
            print("addWordHTML failed: %s %s %d" % (word, resource, relevance))
            print("%s %s" % sys.exc_info()[:2])

    return total
619
def addWordArchive(word, id, relevance):
    """Accumulate `relevance` for `word` against archive message `id`.

    Returns -1 for a None or too short (< 3 characters) word or when `id`
    is None or -1, 0 when the word is ignored, and otherwise the
    accumulated relevance now stored in wordsDictArchive.
    """
    global wordsDictArchive

    if word is None or len(word) < 3:
        return -1
    if id is None or id == -1:
        return -1
    if word in dropWords or ord(word[0]) > 0x80:
        return 0

    if word not in wordsDictArchive:
        wordsDictArchive[word] = {}
    else:
        messages = wordsDictArchive[word]
        if messages is None:
            print("skipped %s" % (word))
            return 0
        try:
            relevance = relevance + messages[id]
        except:
            pass
    wordsDictArchive[word][id] = relevance
    return relevance
647
def addStringArchive(str, id, relevance):
    """Index every word of `str` against the archive message `id`.

    Words longer than 2 characters are passed to addWordArchive(); negative
    results and exceptions are reported on stdout and not added to the
    total.

    Returns -1 when `str` is None or shorter than 3 characters, otherwise
    the sum of the non-negative addWordArchive() results.
    """
    if str is None or len(str) < 3:
        return -1
    total = 0
    for word in cleanupWordsString(str).split():
        if len(word) <= 2:
            continue
        try:
            got = addWordArchive(word, id, relevance)
            if got < 0:
                print("addWordArchive failed: %s %s" % (word, id))
            else:
                total = total + got
        except:
            print("addWordArchive failed: %s %s %d" % (word, id, relevance))
            print("%s %s" % sys.exc_info()[:2])
    return total
667
668#########################################################################
669# #
670# XML API description analysis #
671# #
672#########################################################################
673
def loadAPI(filename):
    """Parse `filename` with libxml2 and return the resulting document."""
    tree = libxml2.parseFile(filename)
    print("loaded %s" % (filename))
    return tree
678
def foundExport(file, symbol):
    """Register `symbol`, exported by `file`, as a function and index its name.

    Returns 0 when either argument is None, 1 otherwise.
    """
    if file is None or symbol is None:
        return 0
    addFunction(symbol, file)
    for part in splitIdentifier(symbol):
        addWord(part, file, symbol, 10)
    return 1
689
def analyzeAPIFile(top):
    """Process one <file> element of the API description.

    Every <exports> child is handed to foundExport() together with the
    file's "name" property; text nodes are skipped and any other child is
    reported on stdout.

    Returns the number of exports that were registered.
    """
    count = 0
    name = top.prop("name")
    cur = top.children
    while cur is not None:
        if cur.type != 'text':
            if cur.name == "exports":
                count = count + foundExport(name, cur.prop("symbol"))
            else:
                # Two placeholders need two values; the message used to be
                # formatted with the file name only and raised TypeError.
                print("unexpected element %s in API doc <file name='%s'>" % (cur.name, name))
        cur = cur.next
    return count
704
def analyzeAPIFiles(top):
    """Process the <files> element: run analyzeAPIFile() on each <file> child.

    Text nodes are skipped and other children are reported on stdout.
    Returns the sum of the analyzeAPIFile() results.
    """
    total = 0
    node = top.children

    while node is not None:
        if node.type != 'text':
            if node.name == "file":
                total = total + analyzeAPIFile(node)
            else:
                print("unexpected element %s in API doc <files>" % (node.name))
        node = node.next
    return total
719
def analyzeAPIEnum(top):
    """Register the <enum> element `top` and index the words of its name.

    Returns 0 when the "file" or "name" property is missing, 1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addEnum(symbol, module)
    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)

    return 1
734
def analyzeAPIConst(top):
    """Register the <const> element `top` and index the words of its name.

    Returns 0 when the "file" or "name" property is missing, 1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addConst(symbol, module)
    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)

    return 1
749
def analyzeAPIType(top):
    """Register the <typedef> element `top` and index the words of its name.

    Returns 0 when the "file" or "name" property is missing, 1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addType(symbol, module)
    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)
    return 1
763
def analyzeAPIFunctype(top):
    """Register the <functype> element `top` and index the words of its name.

    Returns 0 when the "file" or "name" property is missing, 1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addFunctype(symbol, module)
    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)
    return 1
777
def analyzeAPIStruct(top):
    """Register the <struct> element `top` and index its name and info text.

    Words of the identifier get relevance 10; words longer than 2
    characters from the "info" property (quotes replaced by spaces) get
    relevance 5.

    Returns 0 when the "file" or "name" property is missing, 1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addStruct(symbol, module)
    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)

    info = top.prop("info")
    if info is not None:
        for word in info.replace("'", " ").strip().split():
            if len(word) > 2:
                addWord(word, module, symbol, 5)
    return 1
800
def analyzeAPIMacro(top):
    """Register the <macro> element `top` and index its name and description.

    The description is the content of the first <info> child.  Words of
    the identifier get relevance 10, description words longer than 2
    characters get relevance 5.

    Returns 0 when the "file" or "name" property is missing or when the
    macro has no <info> child (it is still registered in that case),
    1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    symbol = symbol.replace("'", " ").strip()

    # locate the first <info> child, ignoring text nodes
    info = None
    node = top.children
    while node is not None:
        if node.type != 'text' and node.name == "info":
            info = node.content
            break
        node = node.next

    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)

    if info is None:
        addMacro(symbol, module)
        print("Macro %s description has no <info>" % (symbol))
        return 0

    info = info.replace("'", " ").strip()
    addMacro(symbol, module, info)
    for word in info.split():
        if len(word) > 2:
            addWord(word, module, symbol, 5)
    return 1
839
def analyzeAPIFunction(top):
    """Register the <function> element `top` and index its documentation.

    Children are handled as follows (text nodes are skipped):
      <info>   - its content becomes the function description;
      <return> - its "info" property is indexed through addString(..., 7);
      <arg>    - its "info" property is indexed through addString(..., 5)
                 and its "name" property through addWord(..., 7).
    The description is indexed with relevance 5 and the words of the
    identifier with relevance 10.  Single quotes are replaced by spaces in
    every text before it is used.

    Returns 0 when the "file" or "name" property is missing, 1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    symbol = symbol.replace("'", " ").strip()

    info = None
    node = top.children
    while node is not None:
        if node.type != 'text':
            tag = node.name
            if tag == "info":
                info = node.content
            elif tag == "return":
                rinfo = node.prop("info")
                if rinfo is not None:
                    addString(rinfo.replace("'", " ").strip(), module, symbol, 7)
            elif tag == "arg":
                ainfo = node.prop("info")
                if ainfo is not None:
                    addString(ainfo.replace("'", " ").strip(), module, symbol, 5)
                name = node.prop("name")
                if name is not None:
                    addWord(name.replace("'", " ").strip(), module, symbol, 7)
        node = node.next

    if info is None:
        print("Function %s description has no <info>" % (symbol))
        addFunction(symbol, module, "")
    else:
        info = info.replace("'", " ").strip()
        addFunction(symbol, module, info)
        addString(info, module, symbol, 5)

    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)

    return 1
890
def analyzeAPISymbols(top):
    """Process the <symbols> element by dispatching each child on its name.

    macro, function, const, typedef, struct, enum and functype children go
    to their analyzeAPI* handler; text nodes are skipped and any other
    child is reported on stdout (the report now names <symbols>, not
    <files>).

    Returns the sum of the handlers' results.
    """
    count = 0
    cur = top.children

    while cur is not None:
        if cur.type != 'text':
            tag = cur.name
            if tag == "macro":
                count = count + analyzeAPIMacro(cur)
            elif tag == "function":
                count = count + analyzeAPIFunction(cur)
            elif tag == "const":
                count = count + analyzeAPIConst(cur)
            elif tag == "typedef":
                count = count + analyzeAPIType(cur)
            elif tag == "struct":
                count = count + analyzeAPIStruct(cur)
            elif tag == "enum":
                count = count + analyzeAPIEnum(cur)
            elif tag == "functype":
                count = count + analyzeAPIFunctype(cur)
            else:
                print("unexpected element %s in API doc <symbols>" % (tag))
        cur = cur.next
    return count
917
def analyzeAPI(doc):
    """Walk the parsed API description `doc` and index its <symbols>.

    The <files> child is deliberately ignored; other unexpected children
    are reported on stdout.

    Returns -1 when `doc` is None or its root element is not <api>,
    otherwise the number of symbols counted by analyzeAPISymbols().
    """
    if doc is None:
        return -1
    root = doc.getRootElement()
    if root.name != "api":
        print("Unexpected root name")
        return -1

    total = 0
    node = root.children
    while node is not None:
        if node.type != 'text':
            if node.name == "files":
                # file/export information is not indexed
                pass
            elif node.name == "symbols":
                total = total + analyzeAPISymbols(node)
            else:
                print("unexpected element %s in API doc" % (node.name))
        node = node.next
    return total
940
941#########################################################################
942# #
943# Web pages parsing and analysis #
944# #
945#########################################################################
946
947import glob
948
def analyzeHTMLText(doc, resource, p, section, id):
    """Index the content of text node `p` for page `resource` (relevance 5).

    Returns the addStringHTML() result, or -1 if anything raises.
    """
    try:
        return 0 + addStringHTML(p.content, resource, id, section, 5)
    except:
        return -1
957
def analyzeHTMLPara(doc, resource, p, section, id):
    """Index the content of paragraph `p` for page `resource` (relevance 5).

    Returns the addStringHTML() result, or -1 if anything raises.
    """
    try:
        return 0 + addStringHTML(p.content, resource, id, section, 5)
    except:
        return -1
966
def analyzeHTMLPre(doc, resource, p, section, id):
    """Index the content of <pre> node `p` for page `resource` (relevance 5).

    Returns the addStringHTML() result, or -1 if anything raises.
    """
    try:
        return 0 + addStringHTML(p.content, resource, id, section, 5)
    except:
        return -1
975
def analyzeHTML(doc, resource, p, section, id):
    """Index the content of node `p` for page `resource` (relevance 5).

    NOTE: a two-argument analyzeHTML(doc, resource) is defined further down
    in this file and rebinds the name, so this variant cannot be reached by
    name once the module has been loaded.

    Returns the addStringHTML() result, or -1 if anything raises.
    """
    try:
        return 0 + addStringHTML(p.content, resource, id, section, 5)
    except:
        return -1
984
def analyzeHTML(doc, resource):
    """Index the parsed HTML document `doc` under the page name `resource`.

    The page is recorded with the content of //head/title (or
    "Page <resource>" when that cannot be read).  The h1/h2/h3 headings and
    text nodes are then visited in the order returned by the XPath query:
    a heading sets the current section and, through its "id" or "name"
    property, the current anchor; every text node is indexed against that
    section and anchor.

    Returns the number of nodes that were indexed; problems during the walk
    are reported on stdout and end it early.
    """
    para = 0
    ctxt = doc.xpathNewContext()
    try:
        title = ctxt.xpathEval("//head/title")[0].content
    except:
        title = "Page %s" % (resource)
    addPage(resource, title)
    try:
        section = title
        id = ""
        for item in ctxt.xpathEval("//h1 | //h2 | //h3 | //text()"):
            if item.name in ('h1', 'h2', 'h3'):
                section = item.content
                anchor = item.prop("id") or item.prop("name")
                if anchor:
                    id = anchor
            elif item.type == 'text':
                analyzeHTMLText(doc, resource, item, section, id)
                para = para + 1
            elif item.name == 'p':
                analyzeHTMLPara(doc, resource, item, section, id)
                para = para + 1
            elif item.name == 'pre':
                analyzeHTMLPre(doc, resource, item, section, id)
                para = para + 1
            else:
                print("Page %s, unexpected %s element" % (resource, item.name))
    except:
        print("Page %s: problem analyzing" % (resource))
        print("%s %s" % sys.exc_info()[:2])

    return para
1021
def analyzeHTMLPages():
    """Index the *.html and tutorial/*.html files of the current directory.

    Files whose name starts with "API" and the file "xml.html" are skipped.
    Each file is parsed as XML first and with the HTML parser when that
    raises.

    Returns the number of pages that were analyzed without an exception.
    """
    done = 0
    for html in glob.glob("*.html") + glob.glob("tutorial/*.html"):
        if html.startswith("API") or html == "xml.html":
            continue
        try:
            doc = libxml2.parseFile(html)
        except:
            doc = libxml2.htmlParseFile(html, None)
        try:
            res = analyzeHTML(doc, html)
            print("Parsed %s : %d paragraphs" % (html, res))
            done = done + 1
        except:
            print("could not parse %s" % (html))
    return done
1041
1042#########################################################################
1043# #
1044# Mail archives parsing and analysis #
1045# #
1046#########################################################################
1047
1048import time
1049
def getXMLDateArchive(t = None):
    """Return the URL of the by-date archive index for the month of `t`.

    `t` is a time in seconds since the epoch, interpreted in UTC; the
    current time is used when it is None.
    """
    if t is None:
        t = time.time()
    when = time.gmtime(t)
    return "http://mail.gnome.org/archives/xml/%d-%s/date.html" % (
        when[0], time.strftime("%B", when))
1058
def scanXMLMsgArchive(url, title, force = 0):
    """Index one archived message: its title and the text inside <pre>.

    A message already recorded in the archives table is skipped unless
    `force` is non-zero; an unknown message is recorded first.  Title
    words get relevance 20, body words relevance 5.

    Returns 1 when the message was indexed and 0 otherwise (None
    arguments, already known, could not be recorded or could not be
    parsed).
    """
    if url is None or title is None:
        return 0

    ID = checkXMLMsgArchive(url)
    known = ID != -1
    if known and force == 0:
        return 0

    if not known:
        ID = addXMLMsgArchive(url, title)
        if ID == -1:
            return 0

    try:
        print("Loading %s" % (url))
        doc = libxml2.htmlParseFile(url, None)
    except:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return 0

    addStringArchive(title, ID, 20)
    ctxt = doc.xpathNewContext()
    for text in ctxt.xpathEval("//pre//text()"):
        addStringArchive(text.content, ID, 5)

    return 1
1088
def scanXMLDateArchive(t = None, force = 0):
    """Index every message linked from the monthly archive index for `t`.

    wordsDictArchive is emptied first.  Only anchors whose href starts
    with "msg" are followed; a leading 'Re: ' and then a leading '[xml] '
    are stripped from the anchor text to form the title.  Errors on
    individual messages are silently ignored.

    Returns -1 when the index page cannot be parsed, otherwise the number
    of messages indexed by scanXMLMsgArchive().
    """
    global wordsDictArchive

    wordsDictArchive = {}

    url = getXMLDateArchive(t)
    print("loading %s" % (url))
    try:
        doc = libxml2.htmlParseFile(url, None)
    except:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return -1

    ctxt = doc.xpathNewContext()
    indexed = 0
    for anchor in ctxt.xpathEval("//a[@href]"):
        href = anchor.prop("href")
        if href is None or href[0:3] != "msg":
            continue
        try:
            msg = libxml2.buildURI(href, url)
            title = anchor.content
            if title is not None and title[0:4] == 'Re: ':
                title = title[4:]
            if title is not None and title[0:6] == '[xml] ':
                title = title[6:]
            indexed = indexed + scanXMLMsgArchive(msg, title, force)
        except:
            pass

    return indexed
1126
1127
1128#########################################################################
1129# #
1130# Main code: open the DB, the API XML and analyze it #
1131# #
1132#########################################################################
def analyzeArchives(t = None, force = 0):
    """Index one month of the mail archive and store the result.

    scanXMLDateArchive(t, force) fills wordsDictArchive; every
    (word, message ID) association it holds is then written with
    updateWordArchive().  Words whose entry is None are skipped.

    The closing report now says "archive pages"; it used to say
    "HTML pages", copied from the HTML indexer.
    """
    global wordsDictArchive

    ret = scanXMLDateArchive(t, force)
    print("Indexed %d words in %d archive pages" % (len(wordsDictArchive), ret))

    i = 0
    for word in wordsDictArchive.keys():
        refs = wordsDictArchive[word]
        if refs is None:
            continue
        for id in refs.keys():
            updateWordArchive(word, id, refs[id])
            i = i + 1

    print("Found %d associations in archive pages" % (i))
1152
def analyzeHTMLTop():
    """Index the HTML pages and store every word/page association.

    analyzeHTMLPages() fills wordsDictHTML; each (word, page) entry it
    holds is then written with updateWordHTML().  Words whose entry is
    None are skipped.
    """
    global wordsDictHTML

    pages = analyzeHTMLPages()
    print("Indexed %d words in %d HTML pages" % (len(wordsDictHTML), pages))

    stored = 0
    for word, refs in wordsDictHTML.items():
        if refs is None:
            continue
        for resource, entry in refs.items():
            (relevance, id, section) = entry
            updateWordHTML(word, resource, section, id, relevance)
            stored = stored + 1

    print("Found %d associations in HTML pages" % (stored))
1172
def analyzeAPITop():
    """Index the API description file and store every word/symbol association.

    The file named by API is loaded and analyzed; on any failure the error
    is printed and the process exits with status 1.  Each association held
    in wordsDict is then written with updateWord(); words whose entry is
    None are counted as skipped.
    """
    global wordsDict
    global API

    try:
        doc = loadAPI(API)
        blocs = analyzeAPI(doc)
        print("Analyzed %d blocs" % (blocs))
        doc.freeDoc()
    except:
        print("Failed to parse and analyze %s" % (API))
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    print("Indexed %d words" % (len(wordsDict)))
    stored = 0
    skipped = 0
    for word, refs in wordsDict.items():
        if refs is None:
            skipped = skipped + 1
            continue
        for (module, symbol), relevance in refs.items():
            updateWord(word, symbol, relevance)
            stored = stored + 1

    print("Found %d associations, skipped %d words" % (stored, skipped))
1200
def usage():
    """Print the command line synopsis and exit with status 1."""
    synopsis = "Usage index.py [--force] [--archive] [--archive-year year] [--archive-month month] [--API] [--docs]"
    print(synopsis)
    sys.exit(1)
1204
def _indexArchiveMonth(spec, force):
    """Index the mail archive of the month `spec`, given as "YEAR-MonthName"."""
    try:
        T = time.strptime(spec, "%Y-%B")
        # Aim ten days into the month; presumably this keeps the UTC
        # conversion done later from falling into the previous month.
        t = time.mktime(T) + 3600 * 24 * 10
        analyzeArchives(t, force)
    except:
        print("Failed to index month archive:")
        print("%s %s" % sys.exc_info()[:2])


def main():
    """Command line entry point: open the database and run each option in order.

    Options are processed left to right, so --force only affects the
    options that follow it.  An unknown option, a missing option value or
    an empty command line prints the usage and exits with status 1.
    """
    try:
        openMySQL()
    except:
        print("Failed to open the database")
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    args = sys.argv[1:]
    if not args:
        usage()

    force = 0
    i = 0
    while i < len(args):
        arg = args[i]
        if arg == '--force':
            force = 1
        elif arg == '--archive':
            analyzeArchives(None, force)
        elif arg == '--archive-year':
            i = i + 1
            if i >= len(args):
                # the year is missing: used to fail with IndexError
                usage()
            year = args[i]
            months = ["January" , "February", "March", "April", "May",
                      "June", "July", "August", "September", "October",
                      "November", "December"]
            for month in months:
                _indexArchiveMonth("%s-%s" % (year, month), force)
        elif arg == '--archive-month':
            i = i + 1
            if i >= len(args):
                # the month is missing: used to fail with IndexError
                usage()
            _indexArchiveMonth(args[i], force)
        elif arg == '--API':
            analyzeAPITop()
        elif arg == '--docs':
            analyzeHTMLTop()
        else:
            usage()
        i = i + 1

if __name__ == "__main__":
    main()
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette