
📄 indexingcrawler.py

📁 Harvestman - latest version
💻 PY
#!/usr/bin/env python

"""
indexingcrawler.py - Demonstrating custom crawler writing by
subscribing to events. This is a crawler which crawls a given
URL and indexes documents at the end of the crawl.

Created by Anand B Pillai <abpillai at gmail dot com>

Copyright (C) 2008 Anand B Pillai
"""

import __init__
import sys, os
import PyLucene

from harvestman.apps.spider import HarvestMan
from harvestman.lib.common.common import *
from types import StringTypes

# You can write pretty crazy custom crawlers by combining
# different events and writing handlers for them ! :)

class IndexingCrawler(HarvestMan):
    """ A text indexing crawler using PyLucene """

    # NOTE: This class performs work equivalent to the lucene plugin ...

    def __init__(self):
        super(IndexingCrawler, self).__init__()

    def create_index(self):
        """ Post download setup callback for creating a lucene index """

        info("Creating lucene index")
        count = 0
        urllist = []

        urldb = objects.datamgr.get_urldb()

        storeDir = "index"
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = PyLucene.FSDirectory.getDirectory(storeDir, True)
        lucene_writer = PyLucene.IndexWriter(store, PyLucene.StandardAnalyzer(), True)
        lucene_writer.setMaxFieldLength(1048576)

        for node in urldb.preorder():
            urlobj = node.get()

            # Only index if web-page or document
            if not urlobj.is_webpage() and not urlobj.is_document():
                continue

            filename = urlobj.get_full_filename()
            url = urlobj.get_full_url()

            try:
                urllist.index(urlobj.index)
                continue
            except ValueError:
                urllist.append(urlobj.index)

            if not os.path.isfile(filename):
                continue

            data = ''

            extrainfo('Adding index for URL', url)
            try:
                data = unicode(open(filename).read(), 'iso-8859-1')
            except UnicodeDecodeError, e:
                data = ''

            try:
                doc = PyLucene.Document()
                doc.add(PyLucene.Field("name", 'file://' + filename,
                                       PyLucene.Field.Store.YES,
                                       PyLucene.Field.Index.UN_TOKENIZED))
                doc.add(PyLucene.Field("path", url,
                                       PyLucene.Field.Store.YES,
                                       PyLucene.Field.Index.UN_TOKENIZED))
                if data and len(data) > 0:
                    doc.add(PyLucene.Field("contents", data,
                                           PyLucene.Field.Store.YES,
                                           PyLucene.Field.Index.TOKENIZED))
                else:
                    warning("warning: no content in %s" % filename)

                lucene_writer.addDocument(doc)
            except PyLucene.JavaError, e:
                print e
                continue

            count += 1

        info('Created lucene index for %d documents' % count)
        info('Optimizing lucene index')
        lucene_writer.optimize()
        lucene_writer.close()

    def post_download_cb(self, event, *args, **kwargs):
        self.create_index()

if __name__ == "__main__":
    spider = IndexingCrawler()
    spider.initialize()
    config = spider.get_config()
    config.verbosity = 3
    config.localise = 0
    config.images = 0
    config.pagecache = 0
    spider.bind_event('postdownload', spider.post_download_cb)
    spider.main()
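For completeness, here is a minimal sketch of how the index built by this crawler could be queried. It is not part of HarvestMan; it assumes the same 2.x-era PyLucene API the crawler uses (FSDirectory, StandardAnalyzer, QueryParser and the Hits result object), and the "index" directory name and the "name", "path" and "contents" field names match what create_index() writes. The script name searchindex.py and the search() helper are hypothetical.

#!/usr/bin/env python
# searchindex.py - hypothetical companion script: query the "index"
# directory built by IndexingCrawler above.

import sys
import PyLucene

def search(query_string, index_dir="index"):
    # Open (do not create) the directory written by create_index()
    store = PyLucene.FSDirectory.getDirectory(index_dir, False)
    searcher = PyLucene.IndexSearcher(store)
    analyzer = PyLucene.StandardAnalyzer()

    # Parse the query against the "contents" field indexed by the crawler
    query = PyLucene.QueryParser("contents", analyzer).parse(query_string)
    hits = searcher.search(query)

    print "%d matching documents" % hits.length()
    for i in range(hits.length()):
        doc = hits.doc(i)
        # "name" and "path" are the stored fields added in create_index()
        print hits.score(i), doc.get("name"), doc.get("path")

    searcher.close()

if __name__ == "__main__":
    search(' '.join(sys.argv[1:]))

Note the design choice in the crawler itself: instead of subclassing internals, it binds create_index() to the 'postdownload' event via spider.bind_event(), so indexing runs exactly once, after the whole crawl finishes.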
