
index.py

Open-source XML parsing code. The version is libxml2-2.6.29, and it supports the GB2312 encoding. Very useful when sending XML in network messages.
Language: Python
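Before the listing, a minimal sketch of the pattern the script is built on: libxml2 driven through its Python bindings. This is an illustration, not part of index.py; the <msg> payload and its fields are made up, and GB2312 decoding additionally assumes the libxml2 build includes iconv support:

import libxml2

# Hypothetical network payload; libxml2 honors the encoding declared
# in the XML header.
data = '<?xml version="1.0" encoding="GB2312"?><msg><to>peer</to></msg>'
doc = libxml2.parseDoc(data)        # parse from an in-memory string
root = doc.getRootElement()
print root.name                     # -> msg
print doc.serialize("UTF-8", 0)     # re-encode to UTF-8 for the wire
doc.freeDoc()                       # the bindings do not free documents automatically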
# Index one <function> element of the API description: its <info> text,
# <return> and <arg> children all feed the word dictionaries.
def analyzeAPIFunction(top):
    file = top.prop("file")
    if file == None:
        return 0
    symbol = top.prop("name")
    if symbol == None:
        return 0
    symbol = string.replace(symbol, "'", " ")
    symbol = string.strip(symbol)
    info = None
    cur = top.children
    while cur != None:
        if cur.type == 'text':
            cur = cur.next
            continue
        if cur.name == "info":
            info = cur.content
        elif cur.name == "return":
            rinfo = cur.prop("info")
            if rinfo != None:
                rinfo = string.replace(rinfo, "'", " ")
                rinfo = string.strip(rinfo)
                addString(rinfo, file, symbol, 7)
        elif cur.name == "arg":
            ainfo = cur.prop("info")
            if ainfo != None:
                ainfo = string.replace(ainfo, "'", " ")
                ainfo = string.strip(ainfo)
                addString(ainfo, file, symbol, 5)
            name = cur.prop("name")
            if name != None:
                name = string.replace(name, "'", " ")
                name = string.strip(name)
                addWord(name, file, symbol, 7)
        cur = cur.next
    if info == None:
        print "Function %s description has no <info>" % (symbol)
        addFunction(symbol, file, "")
    else:
        info = string.replace(info, "'", " ")
        info = string.strip(info)
        addFunction(symbol, file, info)
        addString(info, file, symbol, 5)
    l = splitIdentifier(symbol)
    for word in l:
        addWord(word, file, symbol, 10)
    return 1

# Dispatch each child of <symbols> to the indexer for its kind.
def analyzeAPISymbols(top):
    count = 0
    cur = top.children
    while cur != None:
        if cur.type == 'text':
            cur = cur.next
            continue
        if cur.name == "macro":
            count = count + analyzeAPIMacro(cur)
        elif cur.name == "function":
            count = count + analyzeAPIFunction(cur)
        elif cur.name == "const":
            count = count + analyzeAPIConst(cur)
        elif cur.name == "typedef":
            count = count + analyzeAPIType(cur)
        elif cur.name == "struct":
            count = count + analyzeAPIStruct(cur)
        elif cur.name == "enum":
            count = count + analyzeAPIEnum(cur)
        elif cur.name == "functype":
            count = count + analyzeAPIFunctype(cur)
        else:
            print "unexpected element %s in API doc <files>" % (cur.name)
        cur = cur.next
    return count

# Walk the root <api> element of the API description document.
def analyzeAPI(doc):
    count = 0
    if doc == None:
        return -1
    root = doc.getRootElement()
    if root.name != "api":
        print "Unexpected root name"
        return -1
    cur = root.children
    while cur != None:
        if cur.type == 'text':
            cur = cur.next
            continue
        if cur.name == "files":
            pass
#           count = count + analyzeAPIFiles(cur)
        elif cur.name == "symbols":
            count = count + analyzeAPISymbols(cur)
        else:
            print "unexpected element %s in API doc" % (cur.name)
        cur = cur.next
    return count

#########################################################################
#                                                                       #
#                  Web pages parsing and analysis                       #
#                                                                       #
#########################################################################

import glob

def analyzeHTMLText(doc, resource, p, section, id):
    words = 0
    try:
        content = p.content
        words = words + addStringHTML(content, resource, id, section, 5)
    except:
        return -1
    return words

def analyzeHTMLPara(doc, resource, p, section, id):
    words = 0
    try:
        content = p.content
        words = words + addStringHTML(content, resource, id, section, 5)
    except:
        return -1
    return words

def analyzeHTMLPre(doc, resource, p, section, id):
    words = 0
    try:
        content = p.content
        words = words + addStringHTML(content, resource, id, section, 5)
    except:
        return -1
    return words

# Note: this five-argument analyzeHTML is immediately shadowed by the
# two-argument analyzeHTML defined right after it.
def analyzeHTML(doc, resource, p, section, id):
    words = 0
    try:
        content = p.content
        words = words + addStringHTML(content, resource, id, section, 5)
    except:
        return -1
    return words

# Index one HTML page: record its <title>, then walk headers, paragraphs,
# <pre> blocks and raw text nodes, tracking the current section and anchor.
def analyzeHTML(doc, resource):
    para = 0
    ctxt = doc.xpathNewContext()
    try:
        res = ctxt.xpathEval("//head/title")
        title = res[0].content
    except:
        title = "Page %s" % (resource)
    addPage(resource, title)
    try:
        items = ctxt.xpathEval("//h1 | //h2 | //h3 | //text()")
        section = title
        id = ""
        for item in items:
            if item.name == 'h1' or item.name == 'h2' or item.name == 'h3':
                section = item.content
                if item.prop("id"):
                    id = item.prop("id")
                elif item.prop("name"):
                    id = item.prop("name")
            elif item.type == 'text':
                analyzeHTMLText(doc, resource, item, section, id)
                para = para + 1
            elif item.name == 'p':
                analyzeHTMLPara(doc, resource, item, section, id)
                para = para + 1
            elif item.name == 'pre':
                analyzeHTMLPre(doc, resource, item, section, id)
                para = para + 1
            else:
                print "Page %s, unexpected %s element" % (resource, item.name)
    except:
        print "Page %s: problem analyzing" % (resource)
        print sys.exc_type, sys.exc_value
    return para

# Index all documentation pages, falling back to the HTML parser when a
# page is not well-formed XML.
def analyzeHTMLPages():
    ret = 0
    HTMLfiles = glob.glob("*.html") + glob.glob("tutorial/*.html")
    for html in HTMLfiles:
        if html[0:3] == "API":
            continue
        if html == "xml.html":
            continue
        try:
            doc = libxml2.parseFile(html)
        except:
            doc = libxml2.htmlParseFile(html, None)
        try:
            res = analyzeHTML(doc, html)
            print "Parsed %s : %d paragraphs" % (html, res)
            ret = ret + 1
        except:
            print "could not parse %s" % (html)
    return ret

#########################################################################
#                                                                       #
#                  Mail archives parsing and analysis                   #
#                                                                       #
#########################################################################

import time

# Build the URL of the mailing-list date index for the month containing t
# (defaulting to the current month).
def getXMLDateArchive(t = None):
    if t == None:
        t = time.time()
    T = time.gmtime(t)
    month = time.strftime("%B", T)
    year = T[0]
    url = "http://mail.gnome.org/archives/xml/%d-%s/date.html" % (year, month)
    return url

# Fetch one archived message and index its title and <pre> text content.
def scanXMLMsgArchive(url, title, force = 0):
    if url == None or title == None:
        return 0
    ID = checkXMLMsgArchive(url)
    if force == 0 and ID != -1:
        return 0
    if ID == -1:
        ID = addXMLMsgArchive(url, title)
        if ID == -1:
            return 0
    try:
        print "Loading %s" % (url)
        doc = libxml2.htmlParseFile(url, None)
    except:
        doc = None
    if doc == None:
        print "Failed to parse %s" % (url)
        return 0
    addStringArchive(title, ID, 20)
    ctxt = doc.xpathNewContext()
    texts = ctxt.xpathEval("//pre//text()")
    for text in texts:
        addStringArchive(text.content, ID, 5)
    return 1

# Scan one month of the archive: follow every msgNNNNN link found on the
# date index, stripping "Re: " and "[xml] " prefixes from titles.
def scanXMLDateArchive(t = None, force = 0):
    global wordsDictArchive
    wordsDictArchive = {}
    url = getXMLDateArchive(t)
    print "loading %s" % (url)
    try:
        doc = libxml2.htmlParseFile(url, None)
    except:
        doc = None
    if doc == None:
        print "Failed to parse %s" % (url)
        return -1
    ctxt = doc.xpathNewContext()
    anchors = ctxt.xpathEval("//a[@href]")
    links = 0
    newmsg = 0
    for anchor in anchors:
        href = anchor.prop("href")
        if href == None or href[0:3] != "msg":
            continue
        try:
            links = links + 1
            msg = libxml2.buildURI(href, url)
            title = anchor.content
            if title != None and title[0:4] == 'Re: ':
                title = title[4:]
            if title != None and title[0:6] == '[xml] ':
                title = title[6:]
            newmsg = newmsg + scanXMLMsgArchive(msg, title, force)
        except:
            pass
    return newmsg

#########################################################################
#                                                                       #
#          Main code: open the DB, the API XML and analyze it           #
#                                                                       #
#########################################################################

def analyzeArchives(t = None, force = 0):
    global wordsDictArchive
    ret = scanXMLDateArchive(t, force)
    print "Indexed %d words in %d archive pages" % (len(wordsDictArchive), ret)
    i = 0
    skipped = 0
    for word in wordsDictArchive.keys():
        refs = wordsDictArchive[word]
        if refs == None:
            skipped = skipped + 1
            continue
        for id in refs.keys():
            relevance = refs[id]
            updateWordArchive(word, id, relevance)
            i = i + 1
    print "Found %d associations in archive pages" % (i)

def analyzeHTMLTop():
    global wordsDictHTML
    ret = analyzeHTMLPages()
    print "Indexed %d words in %d HTML pages" % (len(wordsDictHTML), ret)
    i = 0
    skipped = 0
    for word in wordsDictHTML.keys():
        refs = wordsDictHTML[word]
        if refs == None:
            skipped = skipped + 1
            continue
        for resource in refs.keys():
            (relevance, id, section) = refs[resource]
            updateWordHTML(word, resource, section, id, relevance)
            i = i + 1
    print "Found %d associations in HTML pages" % (i)

def analyzeAPITop():
    global wordsDict
    global API
    try:
        doc = loadAPI(API)
        ret = analyzeAPI(doc)
        print "Analyzed %d blocs" % (ret)
        doc.freeDoc()
    except:
        print "Failed to parse and analyze %s" % (API)
        print sys.exc_type, sys.exc_value
        sys.exit(1)
    print "Indexed %d words" % (len(wordsDict))
    i = 0
    skipped = 0
    for word in wordsDict.keys():
        refs = wordsDict[word]
        if refs == None:
            skipped = skipped + 1
            continue
        for (module, symbol) in refs.keys():
            updateWord(word, symbol, refs[(module, symbol)])
            i = i + 1
    print "Found %d associations, skipped %d words" % (i, skipped)

def usage():
    print "Usage index.py [--force] [--archive] [--archive-year year] [--archive-month month] [--API] [--docs]"
    sys.exit(1)

def main():
    try:
        openMySQL()
    except:
        print "Failed to open the database"
        print sys.exc_type, sys.exc_value
        sys.exit(1)
    args = sys.argv[1:]
    force = 0
    if args:
        i = 0
        while i < len(args):
            if args[i] == '--force':
                force = 1
            elif args[i] == '--archive':
                analyzeArchives(None, force)
            elif args[i] == '--archive-year':
                i = i + 1
                year = args[i]
                months = ["January", "February", "March", "April", "May",
                          "June", "July", "August", "September", "October",
                          "November", "December"]
                for month in months:
                    try:
                        str = "%s-%s" % (year, month)
                        T = time.strptime(str, "%Y-%B")
                        # push 10 days into the month so gmtime() stays in
                        # the intended month whatever the local timezone
                        t = time.mktime(T) + 3600 * 24 * 10
                        analyzeArchives(t, force)
                    except:
                        print "Failed to index month archive:"
                        print sys.exc_type, sys.exc_value
            elif args[i] == '--archive-month':
                i = i + 1
                month = args[i]
                try:
                    T = time.strptime(month, "%Y-%B")
                    t = time.mktime(T) + 3600 * 24 * 10
                    analyzeArchives(t, force)
                except:
                    print "Failed to index month archive:"
                    print sys.exc_type, sys.exc_value
            elif args[i] == '--API':
                analyzeAPITop()
            elif args[i] == '--docs':
                analyzeHTMLTop()
            else:
                usage()
            i = i + 1
    else:
        usage()

if __name__ == "__main__":
    main()
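Going by the usage() message and the argument loop in main(), the indexer is meant to be run from the command line once the MySQL database is reachable; the year and month values below are placeholders:

python index.py --API                        # index the API XML description
python index.py --docs                       # index the HTML documentation pages
python index.py --archive                    # index the current month's mailing-list archive
python index.py --force --archive-year 2005  # re-index a whole year, month by month
python index.py --archive-month 2005-June    # one month, in the %Y-%B format strptime expects

Note that flags are processed left to right in a single pass, so --force only affects the options that follow it.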
