index.py
def analyzeAPIFunction(top):
    # Index one <function> element: its description, return value info,
    # argument info and the words making up its identifier.
    file = top.prop("file")
    if file == None:
        return 0
    symbol = top.prop("name")
    if symbol == None:
        return 0
    symbol = string.replace(symbol, "'", " ")
    symbol = string.strip(symbol)
    info = None
    cur = top.children
    while cur != None:
        if cur.type == 'text':
            cur = cur.next
            continue
        if cur.name == "info":
            info = cur.content
        elif cur.name == "return":
            rinfo = cur.prop("info")
            if rinfo != None:
                rinfo = string.replace(rinfo, "'", " ")
                rinfo = string.strip(rinfo)
                addString(rinfo, file, symbol, 7)
        elif cur.name == "arg":
            ainfo = cur.prop("info")
            if ainfo != None:
                ainfo = string.replace(ainfo, "'", " ")
                ainfo = string.strip(ainfo)
                addString(ainfo, file, symbol, 5)
            name = cur.prop("name")
            if name != None:
                name = string.replace(name, "'", " ")
                name = string.strip(name)
                addWord(name, file, symbol, 7)
        cur = cur.next
    if info == None:
        print "Function %s description has no <info>" % (symbol)
        addFunction(symbol, file, "")
    else:
        info = string.replace(info, "'", " ")
        info = string.strip(info)
        addFunction(symbol, file, info)
        addString(info, file, symbol, 5)
    l = splitIdentifier(symbol)
    for word in l:
        addWord(word, file, symbol, 10)
    return 1

def analyzeAPISymbols(top):
    # Walk the <symbols> section and dispatch each entry to its analyzer.
    count = 0
    cur = top.children
    while cur != None:
        if cur.type == 'text':
            cur = cur.next
            continue
        if cur.name == "macro":
            count = count + analyzeAPIMacro(cur)
        elif cur.name == "function":
            count = count + analyzeAPIFunction(cur)
        elif cur.name == "const":
            count = count + analyzeAPIConst(cur)
        elif cur.name == "typedef":
            count = count + analyzeAPIType(cur)
        elif cur.name == "struct":
            count = count + analyzeAPIStruct(cur)
        elif cur.name == "enum":
            count = count + analyzeAPIEnum(cur)
        elif cur.name == "functype":
            count = count + analyzeAPIFunctype(cur)
        else:
            print "unexpected element %s in API doc <symbols>" % (cur.name)
        cur = cur.next
    return count

def analyzeAPI(doc):
    # Top-level analysis of the API XML document: index the <symbols> section.
    count = 0
    if doc == None:
        return -1
    root = doc.getRootElement()
    if root.name != "api":
        print "Unexpected root name"
        return -1
    cur = root.children
    while cur != None:
        if cur.type == 'text':
            cur = cur.next
            continue
        if cur.name == "files":
            pass
            # count = count + analyzeAPIFiles(cur)
        elif cur.name == "symbols":
            count = count + analyzeAPISymbols(cur)
        else:
            print "unexpected element %s in API doc" % (cur.name)
        cur = cur.next
    return count

#########################################################################
#                                                                       #
#                   Web pages parsing and analysis                      #
#                                                                       #
#########################################################################

import glob

def analyzeHTMLText(doc, resource, p, section, id):
    words = 0
    try:
        content = p.content
        words = words + addStringHTML(content, resource, id, section, 5)
    except:
        return -1
    return words

def analyzeHTMLPara(doc, resource, p, section, id):
    words = 0
    try:
        content = p.content
        words = words + addStringHTML(content, resource, id, section, 5)
    except:
        return -1
    return words

def analyzeHTMLPre(doc, resource, p, section, id):
    words = 0
    try:
        content = p.content
        words = words + addStringHTML(content, resource, id, section, 5)
    except:
        return -1
    return words

def analyzeHTML(doc, resource, p, section, id):
    # Note: this definition is shadowed by the two-argument analyzeHTML below.
    words = 0
    try:
        content = p.content
        words = words + addStringHTML(content, resource, id, section, 5)
    except:
        return -1
    return words

def analyzeHTML(doc, resource):
    # Index one HTML page: its title, h1/h2/h3 headings and text content.
    para = 0
    ctxt = doc.xpathNewContext()
    try:
        res = ctxt.xpathEval("//head/title")
        title = res[0].content
    except:
        title = "Page %s" % (resource)
    addPage(resource, title)
    try:
        items = ctxt.xpathEval("//h1 | //h2 | //h3 | //text()")
        section = title
        id = ""
        for item in items:
            if item.name == 'h1' or item.name == 'h2' or item.name == 'h3':
                section = item.content
                if item.prop("id"):
                    id = item.prop("id")
                elif item.prop("name"):
                    id = item.prop("name")
            elif item.type == 'text':
                analyzeHTMLText(doc, resource, item, section, id)
                para = para + 1
            elif item.name == 'p':
                analyzeHTMLPara(doc, resource, item, section, id)
                para = para + 1
            elif item.name == 'pre':
                analyzeHTMLPre(doc, resource, item, section, id)
                para = para + 1
            else:
                print "Page %s, unexpected %s element" % (resource, item.name)
    except:
        print "Page %s: problem analyzing" % (resource)
        print sys.exc_type, sys.exc_value
    return para

def analyzeHTMLPages():
    # Index every HTML page in the current directory and tutorial/,
    # skipping the generated API pages and xml.html.
    ret = 0
    HTMLfiles = glob.glob("*.html") + glob.glob("tutorial/*.html")
    for html in HTMLfiles:
        if html[0:3] == "API":
            continue
        if html == "xml.html":
            continue
        try:
            doc = libxml2.parseFile(html)
        except:
            doc = libxml2.htmlParseFile(html, None)
        try:
            res = analyzeHTML(doc, html)
            print "Parsed %s : %d paragraphs" % (html, res)
            ret = ret + 1
        except:
            print "could not parse %s" % (html)
    return ret

#########################################################################
#                                                                       #
#                 Mail archives parsing and analysis                    #
#                                                                       #
#########################################################################

import time

def getXMLDateArchive(t = None):
    # Build the URL of the date index for the month containing t
    # on the GNOME xml mailing-list archive.
    if t == None:
        t = time.time()
    T = time.gmtime(t)
    month = time.strftime("%B", T)
    year = T[0]
    url = "http://mail.gnome.org/archives/xml/%d-%s/date.html" % (year, month)
    return url

def scanXMLMsgArchive(url, title, force = 0):
    # Index a single archived message: its title and the text of its body.
    if url == None or title == None:
        return 0
    ID = checkXMLMsgArchive(url)
    if force == 0 and ID != -1:
        return 0
    if ID == -1:
        ID = addXMLMsgArchive(url, title)
        if ID == -1:
            return 0
    try:
        print "Loading %s" % (url)
        doc = libxml2.htmlParseFile(url, None)
    except:
        doc = None
    if doc == None:
        print "Failed to parse %s" % (url)
        return 0
    addStringArchive(title, ID, 20)
    ctxt = doc.xpathNewContext()
    texts = ctxt.xpathEval("//pre//text()")
    for text in texts:
        addStringArchive(text.content, ID, 5)
    return 1

def scanXMLDateArchive(t = None, force = 0):
    # Scan a month's date index and index every message linked from it.
    global wordsDictArchive

    wordsDictArchive = {}
    url = getXMLDateArchive(t)
    print "loading %s" % (url)
    try:
        doc = libxml2.htmlParseFile(url, None)
    except:
        doc = None
    if doc == None:
        print "Failed to parse %s" % (url)
        return -1
    ctxt = doc.xpathNewContext()
    anchors = ctxt.xpathEval("//a[@href]")
    links = 0
    newmsg = 0
    for anchor in anchors:
        href = anchor.prop("href")
        if href == None or href[0:3] != "msg":
            continue
        try:
            links = links + 1
            msg = libxml2.buildURI(href, url)
            title = anchor.content
            if title != None and title[0:4] == 'Re: ':
                title = title[4:]
            if title != None and title[0:6] == '[xml] ':
                title = title[6:]
            newmsg = newmsg + scanXMLMsgArchive(msg, title, force)
        except:
            pass
    return newmsg

#########################################################################
#                                                                       #
#           Main code: open the DB, the API XML and analyze it          #
#                                                                       #
#########################################################################

def analyzeArchives(t = None, force = 0):
    # Rebuild the mailing-list archive index and push the word
    # associations to the database.
    global wordsDictArchive

    ret = scanXMLDateArchive(t, force)
    print "Indexed %d words in %d archive pages" % (len(wordsDictArchive), ret)
    i = 0
    skipped = 0
    for word in wordsDictArchive.keys():
        refs = wordsDictArchive[word]
        if refs == None:
            skipped = skipped + 1
            continue
        for id in refs.keys():
            relevance = refs[id]
            updateWordArchive(word, id, relevance)
            i = i + 1
    print "Found %d associations in archive pages" % (i)

def analyzeHTMLTop():
    # Rebuild the HTML documentation index and push the word
    # associations to the database.
    global wordsDictHTML

    ret = analyzeHTMLPages()
    print "Indexed %d words in %d HTML pages" % (len(wordsDictHTML), ret)
    i = 0
    skipped = 0
    for word in wordsDictHTML.keys():
        refs = wordsDictHTML[word]
        if refs == None:
            skipped = skipped + 1
            continue
        for resource in refs.keys():
            (relevance, id, section) = refs[resource]
            updateWordHTML(word, resource, section, id, relevance)
            i = i + 1
    print "Found %d associations in HTML pages" % (i)

def analyzeAPITop():
    # Load the API XML description, index it and push the word
    # associations to the database.
    global wordsDict
    global API

    try:
        doc = loadAPI(API)
        ret = analyzeAPI(doc)
        print "Analyzed %d blocks" % (ret)
        doc.freeDoc()
    except:
        print "Failed to parse and analyze %s" % (API)
        print sys.exc_type, sys.exc_value
        sys.exit(1)
    print "Indexed %d words" % (len(wordsDict))
    i = 0
    skipped = 0
    for word in wordsDict.keys():
        refs = wordsDict[word]
        if refs == None:
            skipped = skipped + 1
            continue
        for (module, symbol) in refs.keys():
            updateWord(word, symbol, refs[(module, symbol)])
            i = i + 1
    print "Found %d associations, skipped %d words" % (i, skipped)

def usage():
    print "Usage: index.py [--force] [--archive] [--archive-year year] [--archive-month month] [--API] [--docs]"
    sys.exit(1)

def main():
    try:
        openMySQL()
    except:
        print "Failed to open the database"
        print sys.exc_type, sys.exc_value
        sys.exit(1)
    args = sys.argv[1:]
    force = 0
    if args:
        i = 0
        while i < len(args):
            if args[i] == '--force':
                force = 1
            elif args[i] == '--archive':
                analyzeArchives(None, force)
            elif args[i] == '--archive-year':
                i = i + 1
                year = args[i]
                months = ["January", "February", "March", "April", "May",
                          "June", "July", "August", "September", "October",
                          "November", "December"]
                for month in months:
                    try:
                        str = "%s-%s" % (year, month)
                        T = time.strptime(str, "%Y-%B")
                        t = time.mktime(T) + 3600 * 24 * 10
                        analyzeArchives(t, force)
                    except:
                        print "Failed to index month archive:"
                        print sys.exc_type, sys.exc_value
            elif args[i] == '--archive-month':
                i = i + 1
                month = args[i]
                try:
                    T = time.strptime(month, "%Y-%B")
                    t = time.mktime(T) + 3600 * 24 * 10
                    analyzeArchives(t, force)
                except:
                    print "Failed to index month archive:"
                    print sys.exc_type, sys.exc_value
            elif args[i] == '--API':
                analyzeAPITop()
            elif args[i] == '--docs':
                analyzeHTMLTop()
            else:
                usage()
            i = i + 1
    else:
        usage()

if __name__ == "__main__":
    main()
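A minimal usage sketch, going by the usage() and main() functions above. The listing starts mid-file, so helpers such as addWord, addString, addFunction, loadAPI, checkXMLMsgArchive and openMySQL, as well as the wordsDict* globals, are defined earlier in the full script and are not shown here; a reachable MySQL database and the API XML plus HTML docs in the working directory are assumed, and the month value below is only an illustrative argument in the %Y-%B format the script parses:

    python index.py --API --docs
    python index.py --force --archive
    python index.py --archive-month 2004-March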