⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 htmlparser.py

📁 网络蜘蛛
💻 PY
📖 第 1 页 / 共 2 页
字号:
        if not match:
            return -1
        if report:
            j = match.start()
            self.handle_comment(rawdata[i+4: j])
        j = match.end()
        return j

    # Internal - parse script (javascript etc) code, return end or -1 if not terminated
    # This handler added abp 29-09-03
    def parse_script_tag(self, i, report=1):
        """Parse a script element that starts at offset i of self.rawdata.

        Searches for the module-level ``jscriptclose`` pattern after the
        opening ``<script``.  When *report* is true and the text between
        the opening tag and that match contains a ``jscriptsrc`` match,
        handle_javascript() is called with the tag name 'script' and a
        list of (name, value) pairs (empty if the matched text holds no
        '=').

        Returns the offset just past the ``jscriptclose`` match, or -1 if
        no such match exists in the data seen so far.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+7].lower() == '<script', 'unexpected call to parse_script_tag()'
        match = jscriptclose.search(rawdata, i+7)
        if not match:
            return -1
        if report:
            # Handle only javascript tags for time being.  We handle only
            # full javascript source links of the form
            # src="http://foo.com/bar/script.js"
            m = jscriptsrc.search(rawdata[i+7:match.start()])
            if m:
                attrs = []
                try:
                    # Split on the first '=' only: the URL itself may
                    # contain '=' (for example in a query string).
                    name, href = m.group().split('=', 1)
                except ValueError:
                    # No '=' in the matched text; report with no attrs.
                    pass
                else:
                    href = href.replace("'", "").replace('"', '')
                    attrs.append((name, href))

                self.handle_javascript('script', attrs)

        return match.end()

    # Internal -- parse java applet tag, return end or -1 if not terminated
    # This handler added abp 29-09-03
    def parse_applet_tag(self, i, report=1):
        """Parse an applet element that starts at offset i of self.rawdata.

        Searches for the module-level ``appletclose`` pattern after the
        opening ``<applet``.  When *report* is true, the text between the
        opening tag and that match is searched with ``appletcode`` and
        ``appletcodebase``; every match containing '=' contributes one
        (name, value) pair with quote characters removed from the value.
        handle_java_applet_tag() is then called with the tag name
        'applet' and the collected pairs, even when the list is empty.

        Returns the offset just past the ``appletclose`` match, or -1 if
        no such match exists in the data seen so far.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+7].lower() == '<applet', 'unexpected call to parse_applet_tag()'
        match = appletclose.search(rawdata, i+7)
        if not match:
            return -1

        if report:
            # The searched text is the same for every pattern.
            body = rawdata[i+7:match.start()]
            attrs = []

            for rexps in (appletcode, appletcodebase):
                m = rexps.search(body)
                if not m:
                    continue
                try:
                    # Split on the first '=' only so that values which
                    # themselves contain '=' are not discarded.
                    name, href = m.group().split('=', 1)
                except ValueError:
                    # No '=' in the matched text; skip this attribute.
                    continue
                href = href.replace("'", "").replace('"', '')
                attrs.append((name, href))

            self.handle_java_applet_tag('applet', attrs)

        return match.end()
    
    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        """Parse a processing instruction that starts at offset i.

        The text between the leading '<?' and the first ``piclose`` match
        is passed to handle_pi().  Returns the offset just past that
        match, or -1 if the instruction is not terminated in the data
        seen so far.
        """
        source = self.rawdata
        assert source[i:i+2] == '<?', 'unexpected call to parse_pi()'
        body_start = i + 2
        closing = piclose.search(source, body_start) # >
        if closing is None:
            return -1
        self.handle_pi(source[body_start:closing.start()])
        return closing.end()

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag at offset i and dispatch it to a handler.

        The tag name is lower-cased and stored in self.lasttag; the raw
        text of the tag is stored in self.__starttag_text.  Attributes
        are collected as (lower-cased name, value) pairs, where value is
        None for an attribute written without '='.  Surrounding quotes
        are stripped and the value is passed through unescape().

        A tag whose trailing text ends with '/>' is reported through
        handle_startendtag(); any other tag goes to handle_starttag(),
        and set_cdata_mode() is called if the tag is listed in
        CDATA_CONTENT_ELEMENTS.

        Returns the offset just past the tag, or the negative value from
        check_for_whole_start_tag() if the tag is incomplete.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            # Unescape quoted and unquoted values alike; entities are
            # valid in both forms.
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        # Modification (Anand): the empty string is accepted here as
        # well, which skips some errored html tags without quitting.
        if end not in ("", ",", ">", "/>"):
            self.error("junk characters in start tag: %s"
                       % repr(rawdata[k:endpos][:20]))
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode()
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        """Return the end offset of the start tag at i, or -1 if incomplete.

        ``locatestarttagend`` is matched at offset i and the character
        following the match decides the outcome:

        * '>'   -- complete tag; returns the offset just past it.
        * '/>'  -- complete empty tag; returns the offset just past it.
        * '/' not followed by '>', end of data, a letter or '=' --
          returns -1.
        * anything else -- self.error() is called.

        Raises AssertionError if the pattern does not match at i, or if
        self.error() returns instead of raising.
        """
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            nextchar = rawdata[j:j+1]
            if nextchar == ">":
                return j + 1
            if nextchar == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                # A '/' without the closing '>' yet: buffer boundary.
                return -1
            if nextchar == "":
                # end of input
                return -1
            if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            self.updatepos(i, j)
            self.error("malformed start tag")
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """Parse the end tag at offset i and report it via handle_endtag().

        The tag name is lower-cased before being passed to
        handle_endtag(), after which clear_cdata_mode() is called.  A tag
        that does not match ``endtagfind`` is reported through
        self.error().

        Returns the offset just past the ``endendtag`` match, or -1 if
        the tag is not terminated in the data seen so far.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1) # >
        if not match:
            return -1
        j = match.end()
        match = endtagfind.match(rawdata, i) # </ + tag + >
        if not match:
            self.error("bad end tag: %s" % repr(rawdata[i:j]))
            # NOTE(review): only reached if error() returns instead of
            # raising.  There is no tag name to report, so skip past the
            # malformed end tag rather than dereferencing None below.
            return j
        tag = match.group(1)
        self.handle_endtag(tag.lower())
        self.clear_cdata_mode()
        return j

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        """Handle a combined start+end tag such as <tag ... />.

        The default implementation reports it as a start tag
        immediately followed by the matching end tag.
        """
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        """Hook for a start tag; the default implementation does nothing.

        *tag* is the lower-cased tag name and *attrs* a list of
        (name, value) pairs.
        """

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        """Hook for an end tag; the default implementation does nothing."""

    # Overridable -- handle character reference
    def handle_charref(self, name):
        """Hook for a character reference; the default does nothing."""

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        """Hook for an entity reference; the default does nothing."""

    # Overridable -- handle data
    def handle_data(self, data):
        """Hook for character data; the default implementation does nothing."""

    # Overridable -- handle comment
    def handle_comment(self, data):
        """Hook for a comment; the default implementation does nothing.

        *data* is the comment text handed over by the parser.
        """

    # Overridable -- handle a javascript (<script>) tag
    def handle_javascript(self, tag, attrs):
        """Hook for a script tag; the default implementation does nothing.

        parse_script_tag() calls this with the tag name 'script' and a
        list of (name, value) pairs, which may be empty.
        """

    # Overridable - handles java applet tags
    def handle_java_applet_tag(self, tag, attrs):
        """Hook for an applet tag; the default implementation does nothing.

        parse_applet_tag() calls this with the tag name 'applet' and a
        list of (name, value) pairs, which may be empty.
        """

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        """Hook for a declaration; the default implementation does nothing."""

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        """Hook for a processing instruction; the default does nothing.

        parse_pi() passes the text found after the leading '<?'.
        """

    def unknown_decl(self, data):
        """Report an unrecognised declaration through self.error().

        The message is "unknown declaration: " followed by the repr of
        *data*.
        """
        # repr() instead of backticks: same result on Python 2, and the
        # backtick form is a syntax error on Python 3.
        self.error("unknown declaration: " + repr(data))

    # Internal -- helper to remove special character quoting
    def unescape(self, s):
        """Return s with the five basic markup entities decoded.

        Handles &lt;, &gt;, &apos;, &quot; and &amp;.  A string without
        any '&' is returned unchanged.
        """
        if '&' not in s:
            return s
        # "&amp;" must stay last, otherwise text such as "&amp;lt;" would
        # be decoded twice.
        replacements = (
            ("&lt;", "<"),
            ("&gt;", ">"),
            ("&apos;", "'"),
            ("&quot;", '"'),
            ("&amp;", "&"),
        )
        for entity, char in replacements:
            s = s.replace(entity, char)
        return s

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -