📄 web_parser.py
字号:
import sys, re; r_iso = re.compile('([\x80-\xFF])')
import HTMLParser
import string
import urllib2
import os
import sys
def iso2utf(s):
    """Return *s* with each high character expanded to a two-character pair.

    Every character matched by the module-level ``r_iso`` pattern
    (``\\x80``-``\\xFF``) is replaced:

    * ``\\x80``-``\\xBF`` becomes ``\\xC2`` followed by the character itself;
    * ``\\xC0``-``\\xFF`` becomes ``\\xC3`` followed by the character whose
      code is 64 lower.

    On a Python 2 byte string this is the UTF-8 encoding of the
    corresponding ISO-8859-1 characters. All other characters are left
    unchanged.
    """
    def _expand(match):
        ch = match.group(0)
        code = ord(ch)
        if code > 0xBF:
            return '\xC3' + chr(code - 64)
        return '\xC2' + ch

    return r_iso.sub(_expand, s)
def save_web_content(url, filename):
    """Download *url* and store its body in *filename*.

    The response body is passed through ``iso2utf`` and written in binary
    mode, overwriting any existing file.

    Args:
        url: Address handed to ``urllib2.urlopen``.
        filename: Path of the file to create or overwrite.

    Raises:
        Whatever ``urllib2.urlopen``, ``read`` or ``open`` raise; the
        connection and the output file are closed in every case.
    """
    usock = urllib2.urlopen(url)
    try:
        data = usock.read()
    finally:
        # Release the connection even when the read fails.
        usock.close()
    with open(filename, 'wb') as fp:
        fp.write(iso2utf(data))
def web_content_parser():
    """Split the table sections of ``web.txt`` into numbered files.

    ``web.txt`` is scanned line by line. Each line containing the
    "Source / Updated" header cell starts a section: the line right after
    it is discarded, then every following line up to (but not including)
    the first line containing ``</table>`` is written to
    ``web_content<N>.txt``, with N counting up from 1.

    Reaching the end of ``web.txt`` inside a section ends that section
    instead of looping forever, and both files are closed even when an
    error occurs.

    Returns:
        The number of ``web_content<N>.txt`` files written.
    """
    header_cell = ('<td width=80 align="left">'
                   '<font face="Verdana" size=1>Source<br>Updated</td>')
    out_name = "web_contentX.txt"
    written = 0
    with open("web.txt", "r") as source:
        while True:
            line = source.readline()
            if not line:
                break
            if header_cell not in line:
                continue
            source.readline()  # skip the line following the header cell (</tr>)
            written += 1
            with open(out_name.replace("X", str(written)), "w") as section:
                while True:
                    line = source.readline()
                    # readline() returns '' at EOF; stop there as well so a
                    # missing </table> cannot hang the script.
                    if not line or "</table>" in line:
                        break
                    section.write(line)
    return written
class WeaterHTMLParser(HTMLParser.HTMLParser):
    """HTML parser that collects text chunks into the module-level ``item`` list.

    ``item`` is not defined by this class; the calling script must bind it
    to a list before any data is fed to the parser.
    """

    def handle_data(self, data):
        """Append *data* to ``item`` unless it is exactly one character long."""
        if len(data) == 1:
            return
        item.append(data)

    def unknown_decl(self, data):
        """Do nothing for unknown declarations (overridden to avoid an exception)."""

    def feed(self, data):
        """Add *data* to the pending buffer and parse what is available.

        May be called repeatedly with pieces of any size; the text may
        contain newlines.
        """
        self.rawdata += data
        self.goahead(0)
#------Step 1----------save the HTML content at url to local disk-------------------#
# Example url: "http://www.lyngsat.com/astra23.html"
if len(sys.argv) < 2:
    # Fail with a readable message instead of a bare IndexError.
    sys.exit("usage: %s <url>" % sys.argv[0])
url = sys.argv[1]
filename = "web.txt"
save_web_content(url,filename)
#------Step 2--------parse and divide the HTML content into separate files----------#
# fileNum is the number of web_content<N>.txt files produced.
fileNum = web_content_parser()
#------Step 3-------for each file parsed in step 2, we extract the information we want-------#
Parser = WeaterHTMLParser()
# Record templates keyed by output column name. Only PAL_dic is used in the
# visible part of the loop below; the others are presumably used further
# down - verify before changing them.
PAL_dic = {"Freq.Tp":"None","Provider Name Channel Name":"None","Video Encryption":"PAL","Audio":"None"}
DVB_dic_1 = {"Freq.Tp":"None","Provider Name Channel Name":"None","Video Encryption":"DVB","SR-FEC":"None","NID-TID":"None"}
DVB_dic_2 = {"Freq.Tp":"None","Provider Name Channel Name":"None","Video Encryption":"DVB","SID-VPID":"None","Audio":"None"}
DVBS2_dic_1 = {"Freq.Tp":"None","Provider Name Channel Name":"None","Video Encryption":"DVB-S2","SR-FEC":"None","NID-TID":"None"}
DVBS2_dic_2 = {"Freq.Tp":"None","Provider Name Channel Name":"None","Video Encryption":"DVB-S2","SID-VPID":"None","Audio":"None"}
# File name patterns; the "X" is replaced by the 1-based section index.
inputName = "web_contentX.txt"
outputName = "LyngSat_X.txt"
database = "dataBaseX.txt"
resultName = "database.txt"
List = []           # reset to an empty list whenever a <tr> line is seen
TotalList = []
combine = ""
rowspan_count = 0   # counted down once per handled row
rowspan_ori = 0     # rowspan value read from the last "td rowspan=" line
PAL_count = 0       # set to rowspan_ori on a PAL row, then counted down
rowspan_flag = 1    # 1: the next "td rowspan=" line is parsed; cleared to 0 afterwards
# The directory may be left over from an earlier run; only fail when it
# really cannot be created.
try:
    os.mkdir("other output")
except OSError:
    if not os.path.isdir("other output"):
        raise
resultFile = open(resultName,"w")
for index in range(1,fileNum+1):
fileInputName = string.replace(inputName,"X",str(index))
file = open(fileInputName,"r")
fileOutPutName = string.replace(outputName,"X",str(index))
outPutFile = open(fileOutPutName,"w")
databaseName = string.replace(database,"X",str(index))
fileDatabase = open(databaseName,"w")
while 1:
item = []
line = file.readline()
#print line
if not line:
break
if line.count("<tr>") > 0:
List = []
if line.count("td rowspan=") > 0 and rowspan_flag == 1:
#raw_input('enter td rowspan')
if line[13] != " ":
rowspan_ori = eval(line[12:14])
else:
rowspan_ori = eval(line[12])
#print rowspan_ori
rowspan_count = rowspan_ori
rowspan_flag = 0
if line.count("/tr") > 0:
#print "rowspan_count"
#print rowspan_count
#print "rowspan_ori"
#print rowspan_ori
#print "PAL_count"
#print PAL_count
#raw_input('take a look')
if len(List) < 4:
continue
if List.count("PAL") > 0 and rowspan_count == rowspan_ori:
PAL_count = rowspan_ori
PAL_dic["Freq.Tp"] = List[1]
PAL_dic["Provider Name Channel Name"] = List[3]
PAL_dic["Audio"] = List[6]
#print PAL_dic
#raw_input('List.count("PAL") > 0 and rowspan_count == rowspan_ori ')
if rowspan_count != 0:
rowspan_count = rowspan_count - 1
if PAL_count != 0:
PAL_count = PAL_count - 1
outPutFile.write("Freq.Tp:")
outPutFile.write(PAL_dic["Freq.Tp"])
outPutFile.write(" ")
outPutFile.write("Provider Name Channel Name:")
outPutFile.write(PAL_dic["Provider Name Channel Name"])
outPutFile.write(" ")
outPutFile.write("Video Encryption:")
outPutFile.write(PAL_dic["Video Encryption"])
outPutFile.write(" ")
outPutFile.write("Audio:")
outPutFile.write(PAL_dic["Audio"])
outPutFile.write("\n")
elif List.count("PAL") > 0 and rowspan_count != rowspan_ori:
PAL_dic["Provider Name Channel Name"] = List[2]
PAL_dic["Audio"] = List[5]
#print PAL_dic
#raw_input('List.count("PAL") > 0 and rowspan_count != rowspan_ori ')
if rowspan_count != 0:
rowspan_count = rowspan_count - 1
if PAL_count != 0:
PAL_count = PAL_count - 1
outPutFile.write("Freq.Tp:")
outPutFile.write(PAL_dic["Freq.Tp"])
outPutFile.write(" ")
outPutFile.write("Provider Name Channel Name:")
outPutFile.write(PAL_dic["Provider Name Channel Name"])
outPutFile.write(" ")
outPutFile.write("Video Encryption:")
outPutFile.write(PAL_dic["Video Encryption"])
outPutFile.write(" ")
outPutFile.write("Audio:")
outPutFile.write(PAL_dic["Audio"])
outPutFile.write("\n")
elif List.count("PAL") == 0 and rowspan_count != rowspan_ori and PAL_count != 0 and len(List) == 4:
PAL_dic["Provider Name Channel Name"] = List[1]
PAL_dic["Audio"] = List[3]
#print PAL_dic
#raw_input('List.count("PAL") == 0 and rowspan_count != rowspan_ori and PAL_count != 0')
if rowspan_count != 0:
rowspan_count = rowspan_count - 1
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -