⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 lucene.py

📁 Harvestman-最新版本
💻 PY
字号:
# -- coding: utf-8
""" Lucene plugin to HarvestMan. This plugin modifies the
behaviour of HarvestMan to create an index of crawled
webpages by using PyLucene.

Author: Anand B Pillai <abpillai at gmail dot com>

Created  Aug 7 2007     Anand B Pillai <abpillai at gmail dot com>

Copyright (C) 2007 Anand B Pillai

"""

__version__ = '2.0 b1'
__author__ = 'Anand B Pillai'

import PyLucene
import sys, os
import time

from harvestman.lib import hooks
from harvestman.lib.common.common import *


class PorterStemmerAnalyzer(object):
    """ Analyzer which chains the standard tokenizer with the standard,
    lower-case, Porter-stem and English stop-word filters, in that order.

    Not used by default; see the commented-out line in create_index. """

    def tokenStream(self, fieldName, reader):
        """ Return the filtered token stream for 'reader'.

        'fieldName' is accepted for interface compatibility and is not used. """

        result = PyLucene.StandardTokenizer(reader)
        result = PyLucene.StandardFilter(result)
        result = PyLucene.LowerCaseFilter(result)
        result = PyLucene.PorterStemFilter(result)
        result = PyLucene.StopFilter(result, PyLucene.StopAnalyzer.ENGLISH_STOP_WORDS)
        return result


def _read_file_text(filename):
    """ Return the contents of 'filename' decoded as iso-8859-1.

    The file handle is always closed, even if reading fails. An empty
    string is returned if the data cannot be decoded. """

    fobj = open(filename)
    try:
        raw = fobj.read()
    finally:
        fobj.close()

    try:
        return unicode(raw, 'iso-8859-1')
    except UnicodeDecodeError:
        return ''


def _make_document(filename, url, data):
    """ Build the PyLucene document for one downloaded file.

    The document gets an untokenized 'name' field (file:// + filename), an
    untokenized 'path' field (the URL) and, when 'data' is non-empty, a
    tokenized 'contents' field. A warning is logged when there is no content. """

    doc = PyLucene.Document()
    doc.add(PyLucene.Field("name", 'file://' + filename,
                           PyLucene.Field.Store.YES,
                           PyLucene.Field.Index.UN_TOKENIZED))
    doc.add(PyLucene.Field("path", url,
                           PyLucene.Field.Store.YES,
                           PyLucene.Field.Index.UN_TOKENIZED))
    if data:
        doc.add(PyLucene.Field("contents", data,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.TOKENIZED))
    else:
        extrainfo("warning: no content in %s" % filename)

    return doc


def create_index(self, arg):
    """ Post download setup callback for creating a lucene index.

    Walks self._urldb in pre-order and adds one document to an index in
    the "index" directory (relative to the current working directory) for
    every web-page or document URL whose downloaded file exists on disk.
    Each distinct urlobj.index is indexed at most once.

    'self' is the object the hook is registered on; the index writer is
    stored on it as self.lucene_writer. 'arg' is not used. """

    moreinfo("Creating lucene index")

    storeDir = "index"
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)

    # NOTE(review): the trailing True is the 'create' flag of the Lucene
    # 2.x API, so an existing index is presumably replaced - confirm.
    store = PyLucene.FSDirectory.getDirectory(storeDir, True)

    self.lucene_writer = PyLucene.IndexWriter(store, PyLucene.StandardAnalyzer(), True)
    # Uncomment this line to enable a PorterStemmer analyzer
    # self.lucene_writer = PyLucene.IndexWriter(store, PorterStemmerAnalyzer(), True)

    # 1048576 == 2**20
    self.lucene_writer.setMaxFieldLength(1048576)

    count = 0
    # Set of url indices already handled; membership tests are O(1).
    # Assumes urlobj.index is hashable (e.g. an integer) - TODO confirm.
    seen = set()

    try:
        for node in self._urldb.preorder():
            urlobj = node.get()
            # Only index if web-page or document
            if not urlobj.is_webpage() and not urlobj.is_document(): continue

            filename = urlobj.get_full_filename()
            url = urlobj.get_full_url()

            # Skip URLs which were already visited
            if urlobj.index in seen: continue
            seen.add(urlobj.index)

            if not os.path.isfile(filename): continue

            moreinfo('Adding index for URL',url)
            data = _read_file_text(filename)

            try:
                doc = _make_document(filename, url, data)
                self.lucene_writer.addDocument(doc)
            except PyLucene.JavaError:
                # sys.exc_info() is used so that this parses on every
                # Python version; the error is reported and skipped.
                print(sys.exc_info()[1])
            else:
                # Count only the documents which were really added
                count += 1

        moreinfo('Created lucene index for %d documents' % count)
        moreinfo('Optimizing lucene index')
        self.lucene_writer.optimize()
    finally:
        # Always close the writer, even if indexing failed midway
        self.lucene_writer.close()


def apply_plugin():
    """ Apply the plugin - overrideable method """

    # This method is expected to perform the following steps.
    # 1. Register the required hook/plugin function
    # 2. Get the config object and set/override any required settings
    # 3. Print any informational messages.

    # The first step is required, the last two are of course optional
    # depending upon the required application of the plugin.

    cfg = objects.config

    hooks.register_post_callback_method('datamgr:post_download_setup_callback',
                                        create_index)
    #logger.disableConsoleLogging()
    # Turn off session-saver feature
    cfg.savesessions = False
    # Turn off interrupt handling
    # cfg.ignoreinterrupts = True
    # No need for localising
    cfg.localise = 0
    # Turn off image downloading
    cfg.images = 0
    # Turn off caching
    cfg.pagecache = 0

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -