⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 web_parser.py

📁 web parser的範例程式
💻 PY
📖 第 1 页 / 共 2 页
字号:
import sys, re; r_iso = re.compile('([\x80-\xFF])')
import HTMLParser
import string
import urllib2
import os
import sys

def iso2utf(s):
    """Return *s* with every byte in the range 0x80-0xFF expanded to two bytes.

    Bytes 0x80-0xBF become ``'\\xC2'`` followed by the byte itself; bytes
    0xC0-0xFF become ``'\\xC3'`` followed by the byte minus 64.  This is the
    two-byte UTF-8 form of the corresponding ISO-8859-1 character.  All other
    bytes are left as they are.
    """
    def _expand(match):
        byte = match.group(0)
        code = ord(byte)
        if code > 0xBF:
            # Upper half of the high range: lead byte C3, trail byte shifted down.
            return '\xC3' + chr(code - 64)
        # Lower half of the high range: lead byte C2, trail byte unchanged.
        return '\xC2' + byte

    return r_iso.sub(_expand, s)

def save_web_content(url, filename):
    """Download *url* and save it to *filename* after passing it through iso2utf.

    url      -- address handed to ``urllib2.urlopen``.
    filename -- path of the file to (over)write, opened in binary mode.

    The connection and the output file are closed even when reading,
    converting or writing raises, so no handle is leaked on error.
    Exceptions from ``urllib2`` or the file system propagate to the caller.
    """
    usock = urllib2.urlopen(url)
    try:
        data = usock.read()
    finally:
        usock.close()

    fp = open(filename, 'wb')
    try:
        fp.write(iso2utf(data))
    finally:
        fp.close()
    
def web_content_parser():
    """Split the tables found in "web.txt" into numbered "web_contentN.txt" files.

    A table starts at a line containing the "Source<br>Updated" header cell.
    The line right after that header (the closing </tr>) is skipped, and every
    following line up to, but not including, the first line containing
    "</table>" is copied to the next output file (web_content1.txt,
    web_content2.txt, ...).

    Returns the number of output files written.
    """
    header_marker = '<td width=80 align="left"><font face="Verdana" size=1>Source<br>Updated</td>'
    outName = "web_contentX.txt"
    index = 1

    source = open("web.txt", "r")
    try:
        while 1:
            line = source.readline()
            if not line:
                break
            if header_marker not in line:
                continue

            source.readline()  # ignore the </tr> that closes the header row
            fileOutName = outName.replace("X", str(index))
            file_out = open(fileOutName, "w")
            try:
                while 1:
                    line = source.readline()
                    # readline() returns '' at EOF; without this check a table
                    # that is never closed would make the loop spin forever.
                    if not line or "</table>" in line:
                        break
                    file_out.write(line)
            finally:
                file_out.close()
            index = index + 1
    finally:
        source.close()
    return index - 1
           
class WeaterHTMLParser(HTMLParser.HTMLParser):
    """HTML parser that collects text nodes into the module-level ``item`` list."""

    def handle_data(self, data):
        """Append *data* to the global ``item`` list unless its length is 1.

        NOTE(review): the length-1 filter presumably drops lone newline or
        separator characters -- confirm against the pages being parsed.
        """
        if len(data) == 1:
            return
        item.append(data)

    def unknown_decl(self, data):
        """Ignore unknown declarations; overridden to avoid an exception."""
        return None

    def feed(self, data):
        """Add *data* to the pending input and parse as much as possible.

        May be called repeatedly with chunks of any size (the text may
        contain '\n').
        """
        self.rawdata += data
        self.goahead(0)





#------Step 1----------save the HTML content at url to local disk-------------------#
# The page to fetch is given as the first command-line argument, for example:
#url = "http://www.lyngsat.com/astra23.html"
url = sys.argv[1]

filename = "web.txt"
save_web_content(url,filename)


#------Step 2--------parse and divide the HTML content into separate files----------#
# fileNum is the number of web_contentN.txt files written by web_content_parser().
fileNum = web_content_parser()




#------Step 3-------for each file parsed in step 2, we extract the information we want-------#
Parser = WeaterHTMLParser()
# Record templates, one per "Video Encryption" value; every other field starts as the string "None".
PAL_dic = {"Freq.Tp":"None","Provider Name Channel Name":"None","Video Encryption":"PAL","Audio":"None"}
DVB_dic_1 = {"Freq.Tp":"None","Provider Name Channel Name":"None","Video Encryption":"DVB","SR-FEC":"None","NID-TID":"None"}
DVB_dic_2 = {"Freq.Tp":"None","Provider Name Channel Name":"None","Video Encryption":"DVB","SID-VPID":"None","Audio":"None"}
DVBS2_dic_1 = {"Freq.Tp":"None","Provider Name Channel Name":"None","Video Encryption":"DVB-S2","SR-FEC":"None","NID-TID":"None"}
DVBS2_dic_2 = {"Freq.Tp":"None","Provider Name Channel Name":"None","Video Encryption":"DVB-S2","SID-VPID":"None","Audio":"None"}

# File name patterns; the "X" is replaced by the 1-based index of the file being processed.
inputName = "web_contentX.txt"
outputName = "LyngSat_X.txt"
database = "dataBaseX.txt"
resultName = "database.txt"

# Working state shared with the extraction loop that follows.
List = []
TotalList = []
combine = ""
rowspan_count = 0
rowspan_ori = 0
PAL_count = 0
rowspan_flag = 1

# Create the directory only when it is missing: a bare os.mkdir() raises
# OSError if a previous run already created it, aborting every re-run.
if not os.path.isdir("other output"):
    os.mkdir("other output")
resultFile = open(resultName,"w")

for index in range(1,fileNum+1):
    fileInputName = string.replace(inputName,"X",str(index))
    file = open(fileInputName,"r")
    fileOutPutName = string.replace(outputName,"X",str(index))
    outPutFile = open(fileOutPutName,"w")
    databaseName = string.replace(database,"X",str(index))
    fileDatabase = open(databaseName,"w")
    
    while 1:
        item = []
        line = file.readline()
        #print line
        if not line:
            break
        if line.count("<tr>") > 0:
            List = []
        if line.count("td rowspan=") > 0 and rowspan_flag == 1:
            #raw_input('enter td rowspan')
            if line[13] != " ":
                rowspan_ori = eval(line[12:14])
            else:
                rowspan_ori = eval(line[12])
            #print rowspan_ori
            rowspan_count = rowspan_ori
            rowspan_flag  = 0
        if line.count("/tr") > 0:
            #print "rowspan_count"
            #print rowspan_count
            #print "rowspan_ori"
            #print rowspan_ori
            #print "PAL_count"
            #print PAL_count
            #raw_input('take a look')
            if len(List) < 4:
                continue 
            if List.count("PAL") > 0 and rowspan_count == rowspan_ori:
                PAL_count = rowspan_ori
                PAL_dic["Freq.Tp"] = List[1]
                PAL_dic["Provider Name Channel Name"] = List[3]
                PAL_dic["Audio"] = List[6]
                #print PAL_dic
                #raw_input('List.count("PAL") > 0 and rowspan_count == rowspan_ori ')
                if rowspan_count != 0:
                    rowspan_count = rowspan_count - 1
                if PAL_count != 0:
                    PAL_count = PAL_count - 1    
                outPutFile.write("Freq.Tp:")
                outPutFile.write(PAL_dic["Freq.Tp"])
                outPutFile.write("  ")
                outPutFile.write("Provider Name Channel Name:")
                outPutFile.write(PAL_dic["Provider Name Channel Name"])
                outPutFile.write("  ") 
                outPutFile.write("Video Encryption:")
                outPutFile.write(PAL_dic["Video Encryption"])
                outPutFile.write("  ") 
                outPutFile.write("Audio:")
                outPutFile.write(PAL_dic["Audio"])
                outPutFile.write("\n")     
            elif List.count("PAL") > 0 and rowspan_count != rowspan_ori:
                PAL_dic["Provider Name Channel Name"] = List[2]
                PAL_dic["Audio"] = List[5] 
                #print PAL_dic
                #raw_input('List.count("PAL") > 0 and rowspan_count != rowspan_ori ')
                if rowspan_count != 0:
                    rowspan_count = rowspan_count - 1
                if PAL_count != 0:
                    PAL_count = PAL_count - 1
                outPutFile.write("Freq.Tp:")
                outPutFile.write(PAL_dic["Freq.Tp"])
                outPutFile.write("  ")
                outPutFile.write("Provider Name Channel Name:")
                outPutFile.write(PAL_dic["Provider Name Channel Name"])
                outPutFile.write("  ") 
                outPutFile.write("Video Encryption:")
                outPutFile.write(PAL_dic["Video Encryption"])
                outPutFile.write("  ") 
                outPutFile.write("Audio:")
                outPutFile.write(PAL_dic["Audio"])
                outPutFile.write("\n")  
            elif List.count("PAL") == 0 and rowspan_count != rowspan_ori and PAL_count != 0 and len(List) == 4:
                PAL_dic["Provider Name Channel Name"] = List[1]
                PAL_dic["Audio"] = List[3] 
                #print PAL_dic
                #raw_input('List.count("PAL") == 0 and rowspan_count != rowspan_ori and PAL_count != 0')
                if rowspan_count != 0:
                    rowspan_count = rowspan_count - 1

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -