📄 datamgr.py
""" HarvestManDataManager.py - Data manager module for HarvestMan.
This software is part of the HarvestMan(R) program.
Author: Anand B Pillai (anandpillai at letterboxes dot org).
Copyright (C) 2004-2005 Anand B Pillai.
Created July 29 2003    Anand    Modifications for 1.1 release
                                 (new module).

Modification History

 Oct 1 2003      Anand    Fixed bugs with localizing anchor type links.
 Jan 2 2004      Anand    1.3.1 bug fix version.
 Jun 4-9 2004    Anand    1.4 development.
                          * Fixed a bug in localising links;
                            directory urls need to be skipped.
 Jun 14 2004     Anand    1.3.9 release.
"""
import os, sys
import time
import math
import binascii
import shutil
from threading import Lock, RLock, Condition
# Utils
import utils
from urlthread import harvestManUrlThreadPool
from connector import *
from common import *
# __metaclass__ = object
class harvestManDataManager:
""" The data manager cum indexer class """
def __init__(self):
self._numfailed = 0
self._projectcache = []
self._downloaddict = { '_savedfiles': [],
'_deletedfiles': [],
'_failedurls' : [],
'_cacheinfo' : [],
'_invalidurls': [],
'_validurls' : [],
}
# Used for localizing links
self._linksdict = {}
# Cache for connectors
self._connectorcache = {}
# byte count
self._bytes = 0L
# Redownload flag
self._redownload = False
# data lock
self._dataLock = Condition(RLock())
self._dataLock2 = Condition(RLock())
# Url thread group class for multithreaded downloads
self._urlThreadPool = harvestManUrlThreadPool()
self._urlThreadPool.spawn_threads()
def get_proj_cache_filename(self):
""" Return the cache filename for the current project """
# Note that this function does not actually build the cache directory.
configobj = GetObject('config')
# Get the cache file path
cachedir = os.path.join(configobj.projdir, "hm-cache")
cachefilename = os.path.join(cachedir, configobj.project + ".hmc")
return cachefilename
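# Illustrative sketch of the path built above, using hypothetical
# config values: with projdir='/home/user/projects/myproj' and
# project='myproj', the method would return
#
#   /home/user/projects/myproj/hm-cache/myproj.hmc
#
# i.e. the cache file always lives in an 'hm-cache' subdirectory of
# the project directory and carries the '.hmc' extension.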
def get_links_dictionary(self):
return self._linksdict
def read_project_cache(self):
""" Try to read the project cache file """
# Get cache filename
moreinfo('Reading Project Cache...')
cachereader = utils.HarvestManCacheManager(self.get_proj_cache_filename())
obj = cachereader.read_project_cache()
if obj is not None:
self._projectcache = obj
return
def is_url_cache_uptodate(self, url, filename, contentlen, urldata):
""" Check with project cache and find out if the
content needs update """
# Sep 16 2003: fixed a bug here; we also need to check
# that the file exists.
# If page caching is not enabled, return False
# straightaway!
if not GetObject('config').pagecache:
return False
# Return True if the cache is up to date (no update needed)
# and False if the cache is out of date (update needed).
# NOTE: We compare the md5 checksum of the file's data
# with the md5 checksum stored in the cache file.
if contentlen==0:
return False
# Look up the dictionary containing the url cache info
import md5
m1 = md5.new()
m1.update(urldata)
digest1 = m1.digest()
# Assume a priori that the cache is not up to date
uptodate=False
keyfound = False
fileverified=False
# Reference to dictionary in the cache list
cachekey = {}
for x in self._projectcache:
if x.has_key('url'):
if x['url'] == url:
cachekey = x
keyfound = True
x['updated']=False
fileloc = x['location']
if os.path.exists(fileloc) and os.path.abspath(fileloc) == os.path.abspath(filename):
fileverified=True
if x.has_key('checksum'):
# This value is stored encrypted in the cache file (and
# hence in the dictionary). To compare it, we decrypt it
# and compare the hex digests of the stored checksum and
# of the freshly downloaded data.
cachemd5 = bin_decrypt(x['checksum'])
if binascii.hexlify(cachemd5) == binascii.hexlify(digest1) and fileverified:
uptodate=True
break
else:
break
# If the cache is not up to date, update the keys of this cache entry
if not uptodate:
cachekey['checksum'] = bin_crypt(digest1)
cachekey['location'] = filename
cachekey['content-length'] = contentlen
cachekey['updated'] = True
# If key was not found, we need to create a new
# entry and append it to the list, otherwise the
# cache dictionary is modified in place.
if not keyfound:
cachekey['url'] = url
self._projectcache.append(cachekey)
# If both checksums are equal, return True
# else return False.
return uptodate
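# A minimal sketch of the checksum test used above; 'cache_entry' and
# 'urldata' are hypothetical stand-ins for a cached dictionary and the
# freshly fetched page data. A page is up to date when the hex form of
# the decrypted cached digest matches the hex digest of the new data
# and the saved file is still on disk:
#
#   import md5, binascii
#   new_digest = md5.new(urldata).digest()
#   cached_digest = bin_decrypt(cache_entry['checksum'])
#   uptodate = (binascii.hexlify(cached_digest) ==
#               binascii.hexlify(new_digest)) and fileverified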
def is_url_uptodate(self, url, filename, lmt):
""" New function to check whether the url cache is out
of date by comparing last modified time """
# If page caching is not enabled, return False
# straightaway!
if not GetObject('config').pagecache:
return False
# Assume a priori that the cache is not up to date
uptodate=False
keyfound = False
fileverified=False
# Reference to dictionary in the cache list
cachekey = {}
for x in self._projectcache:
if x.has_key('url'):
if x['url'] == url:
cachekey = x
keyfound = True
x['updated']=False
fileloc = x['location']
if os.path.exists(fileloc) and os.path.abspath(fileloc) == os.path.abspath(filename):
fileverified=True
if x.has_key('last-modified'):
# Get current modified time
cmt = x['last-modified']
# If the latest page has a modified time greater than the
# cached one, the cached copy is out of date; otherwise it
# is up to date.
if lmt<=cmt:
uptodate=True
else:
break
break
# If the cache is not up to date, update the keys of this cache entry
if not uptodate:
cachekey['location'] = filename
cachekey['last-modified'] = lmt
cachekey['updated'] = True
# If key was not found, we need to create a new
# entry and append it to the list, otherwise the
# cache dictionary is modified in place.
if not keyfound:
cachekey['url'] = url
self._projectcache.append(cachekey)
# Return True if the cached copy is up to date,
# else return False.
return uptodate
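# A minimal sketch of the last-modified test used above, with
# hypothetical timestamps. 'lmt' is the last-modified time reported
# for the live page and 'cmt' the one recorded in the cache entry;
# the cached copy is treated as current only when the live page is
# not newer:
#
#   cmt = cache_entry['last-modified']   # e.g. 1086912000
#   uptodate = (lmt <= cmt)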
def conditional_cache_set(self):
""" A utility function to conditionally enable/disable
the cache mechanism """
# If the cache file exists for this project, disable
# cache, else enable it.
cfg = GetObject('config')
cachefilename = self.get_proj_cache_filename()
if os.path.exists(cachefilename) and os.path.getsize(cachefilename):
cfg.pagecache = False
else:
cfg.pagecache = True
def does_cache_need_update(self):
""" Find out if project cache needs update """
# If any of the dictionary entries has the key
# value for 'updated' set to True, the cache needs
# update, else not.
needsupdate=False
for x in self._projectcache:
if x.has_key('updated'):
if x['updated']:
needsupdate=x['updated']
break
return needsupdate
def fill_project_cache(self, name, value, fileloc):
""" This function fills the project cache from an
existing project cache file """
# if the tagname is 'location', add this entry to the list
if name=='location':
cachedict={}
cachedict[name]=value
self._projectcache.append(cachedict)
return 0
# Otherwise, lookup the location key in the list of dictionaries
# and add this new name/value pair in the dictionary
else:
ok=False
for x in self._projectcache:
if x.has_key('location'):
# We don't want to overwrite the key's value
if not x.has_key(name):
if x['location'] == fileloc:
# Add the new key/value pair
# print name, value
ok=True
x[name]= value
break
if ok:
return 0
else:
return -1
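# A minimal sketch of how fill_project_cache builds up the cache list,
# with hypothetical values. A 'location' tag starts a new entry; every
# other tag is attached to the entry whose 'location' matches fileloc:
#
#   fill_project_cache('location', '/proj/files/index.html',
#                      '/proj/files/index.html')
#   fill_project_cache('url', 'http://www.foo.com/index.html',
#                      '/proj/files/index.html')
#   # self._projectcache ->
#   #   [{'location': '/proj/files/index.html',
#   #     'url': 'http://www.foo.com/index.html'}]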
def post_download_setup(self):
""" Actions to perform after project is complete """
configobj = GetObject('config')
if configobj.retryfailed:
self._numfailed = len(self._downloaddict['_failedurls'])
moreinfo(' ')
# try downloading again
# mod: made this multithreaded
moreinfo('Redownloading failed links...')
if self._numfailed:
self._redownload=True
for urlobj in self._downloaddict['_failedurls']:
self.download_url( urlobj )
# bugfix: Moved the time calculation code here.
if sys.platform == 'win32' or os.name=='nt':
t2=time.clock()
else:
t2=time.time()
configobj.endtime = t2
# Write cache file
if configobj.pagecache and self.does_cache_need_update():
cachewriter = utils.HarvestManCacheManager( self.get_proj_cache_filename() )
cachewriter.write_project_cache(self._projectcache)
# Localise links in the downloaded files; don't do this if
# just-in-time (jit) localisation is enabled.
if configobj.localise and not configobj.jitlocalise:
self.localise_links()
# Get handle to rules checker object
ruleschecker = GetObject('ruleschecker')
# dump downloaded urls to a text file
if configobj.urlslistfile:
# Get urls list file
ruleschecker.dump_urls( configobj.urlslistfile )
# print stats of the project
nlinks, nservers, ndirs = ruleschecker.get_stats()
nfailed = self._numfailed
numstillfailed = len(self._downloaddict['_failedurls'])
numfiles = len(self._downloaddict['_savedfiles'])
numfilesincache = len(self._downloaddict['_cacheinfo'])
numretried = self._numfailed - numstillfailed
fetchtime = float((math.modf((configobj.endtime-configobj.starttime)*100.0)[1])/100.0)
statsd = { 'links' : nlinks,
'extservers' : nservers,
'extdirs' : ndirs,
'failed' : nfailed,
'fatal' : numstillfailed,
'files' : numfiles,
'filesincache' : numfilesincache,
'retries' : numretried,
'fetchtime' : fetchtime,
}
self.print_project_info(statsd)
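# A worked example of the fetchtime expression above, with a
# hypothetical elapsed time: math.modf(x) returns (fractional part,
# integer part), so the formula truncates the elapsed seconds to two
# decimal places.
#
#   elapsed = 52.6789                       # endtime - starttime
#   math.modf(elapsed * 100.0)[1] / 100.0   # -> 52.67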
def update_bytes(self, count):
""" Update the global byte count """
try:
self._dataLock.acquire()
self._bytes += count
finally:
self._dataLock.release()
def update_dead_links(self, url):
""" Add this link to the 404 (dead links) database """
try:
self._dataLock.acquire()
try:
self._downloaddict['_invalidurls'].index(url)
except:
self._downloaddict['_invalidurls'].append(url)
finally:
self._dataLock.release()
def update_valid_links(self, url):
""" Add this link to the working links database """
try:
self._dataLock.acquire()
try:
self._downloaddict['_validurls'].index(url)
except:
self._downloaddict['_validurls'].append(url)
finally:
self._dataLock.release()
def update_connector_cache(self, conn, urlobj):
""" Add this connector to the connector cache """
try:
self._dataLock.acquire()
self._connectorcache[urlobj] = conn
finally:
self._dataLock.release()
def is_a_dead_link(self, url):
""" Check whether the passed url is a dead (404) link """
dead = False
try:
self._downloaddict['_invalidurls'].index( url )
dead = True
except:
pass
return dead
def update_failed_files(self, urlObject):
""" Add the passed information to the failed files list """
if self._redownload: return -1
# From version 1.1, we try to pass around instances of
# urlPathParser objects instead of urls or filenames or
# their tuples. Many functions have changed, this is one
# of them.
# It makes sense to add the complete urlPathParser object,
# since we can get all information from it later on.
try:
self._dataLock.acquire()
try:
self._downloaddict['_failedurls'].index(urlObject)
except:
self._downloaddict['_failedurls'].append(urlObject)
finally:
self._dataLock.release()
def update_file_stats(self, urlObject, status):
""" Add the passed information to the saved file list """
try:
self._dataLock.acquire()