test_pageparser.py

来自「Harvestman-最新版本」· Python 代码 · 共 107 行

107 行

# -- coding: utf-8
""" Unit test for pageparser module

Created: Anand B Pillai <abpillai@gmail.com> Jul 17 2008

Copyright (C) 2008, Anand B Pillai.
"""

import test_base
import unittest
import sys, os
import time

# NOTE(review): setUp() is deliberately called before the harvestman imports,
# exactly as in the original ordering; presumably it prepares the import
# path for the package under test -- confirm against test_base.
test_base.setUp()

from harvestman.lib.pageparser import HarvestManSimpleParser, HarvestManSGMLOpParser, HarvestManCSSParser
from harvestman.lib.urlparser import HarvestManUrl
from harvestman.lib.common.macros import *
from harvestman.lib.urltypes import *

from sgmllib import SGMLParseError

# Directory holding the fixture files (pass.html, fail.html, pass.css);
# it is the directory that contains the test_base module.
curdir = os.path.abspath(os.path.dirname(test_base.__file__))


def _read_fixture(name):
    """ Return the full contents of the fixture file `name` found in curdir.

    The file handle is always closed before returning, even if the read
    fails (try/finally is used instead of `with` to stay compatible with
    the old interpreters this sgmllib-based code targets). """

    f = open(os.path.join(curdir, name))
    try:
        return f.read()
    finally:
        f.close()


class Link:
    """ Small wrapper pairing a link type with its URL.

    Equality is defined against the URL only, so a list of Link objects
    can be searched with a plain URL string (`'x.css' in links`,
    `links.index('x.css')`). """

    def __init__(self, typ, url):
        self.typ = typ
        self.url = url

    def __eq__(self, link):
        return link==self.url

    def __ne__(self, link):
        # Python 2 does not derive != from ==; keep the two consistent.
        return not self.__eq__(link)


def Linkify(links):
    """ Convert an iterable of (typ, url) pairs into a list of Link objects. """

    return [Link(typ, url) for typ, url in links]


class TestHarvestManPageParser(unittest.TestCase):
    """ Unit test class for all classes in pageparser module """

    # Supported tags
    tags = ('a','frame','img','form','link','body','script',
            'applet','area','meta','embed','object','option')

    def test_simpleparser(self):
        """ HarvestManSimpleParser parses pass.html and rejects fail.html. """

        # Test features (tags)
        for tag in self.tags:
            self.assertTrue(tag in HarvestManSimpleParser.features,
                            "tag %r missing from parser features" % tag)

        # Parse test
        p = HarvestManSimpleParser()
        p.feed(_read_fixture('pass.html'))

        # There should be 29 links and 4 images
        self.assertEqual(len(p.links), 29)
        self.assertEqual(len(p.images), 4)
        self.assertEqual(p.keywords,
                         ['crawler', 'spider', 'bot', 'web-bot', 'robot',
                          'offline', 'browser', 'web', 'internet', 'harvest',
                          'harvestman', 'http', 'browsing', 'searching',
                          'python', 'tools', 'aggregator', 'mining',
                          'intelligent', 'agents', 'agent-based computing',
                          'autonomous', 'documents'])
        self.assertEqual(p.description, "Project page of the HarvestMan WebCrawler")
        self.assertEqual(p.title, 'The HarvestMan WebCrawler')

        link_urls = Linkify(p.links)
        # There should be a stylesheet link
        self.assertTrue('style.css' in link_urls)
        # There will be an anchor link
        l = link_urls.index('download.html#latest')
        self.assertEqual(link_urls[l].typ, URL_TYPE_ANCHOR)

        image_urls = Linkify(p.images)
        self.assertTrue('images/HarvestMan_s.jpg' in image_urls)

        p.reset()
        # This page should fail the parser: feeding it must raise
        # SGMLParseError, otherwise the test fails.
        self.assertRaises(SGMLParseError, p.feed, _read_fixture('fail.html'))

    def test_sgmlopparser(self):
        """ HarvestManSGMLOpParser parses the page the simple parser rejects. """

        # There is only one test, i.e the fail page
        # should parse with this parser.
        p = HarvestManSGMLOpParser()
        # This page should not fail the parser. Any exception raised here
        # is left to propagate so the runner reports the real traceback
        # instead of an anonymous assertion failure.
        p.feed(_read_fixture('fail.html'))
        self.assertEqual(len(p.links), 4)

    def test_cssparser(self):
        """ HarvestManCSSParser extracts all links and the CSS-only links. """

        p = HarvestManCSSParser()
        p.feed(_read_fixture('pass.css'))

        self.assertEqual(p.links, ['css1.css','css2.css','fancybullet.gif'])
        self.assertEqual(p.csslinks, ['css1.css','css2.css'])


def run(result):
    """ Run this module's test case through test_base.run_test, passing
    `result` along, and return whatever run_test returns. """

    return test_base.run_test(TestHarvestManPageParser, result)


if __name__=="__main__":
    s = unittest.TestLoader().loadTestsFromTestCase(TestHarvestManPageParser)
    try:
        unittest.TextTestRunner(verbosity=2).run(s)
    finally:
        # Always clean up, even if the runner itself raises.
        test_base.clean_up()

test_pageparser.py - 源码说明

本页面展示了「Harvestman-最新版本」中的 test_pageparser.py 源码文件，采用 Python 编程语言编写，共 107 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。

虫虫下载站收录了大量与Harvestman相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。

⌨️ 快捷键说明

复制代码Ctrl + C

搜索代码Ctrl + F

全屏模式F11

增大字号Ctrl + =

减小字号Ctrl + -

显示快捷键?