""" HarvestManUrlTracker.py - Module to track and download urls
from the internet using urllib2. This software is part
of the HarvestMan(R) program.
Author: Anand B Pillai (anandpillai at letterboxes dot org).
For licensing information see the file LICENSE.txt that
is included in this distribution.
Dependency
==========
1. HarvestManHTMLParser.py
2. HarvestManRobotParser.py
3. HarvestManUrlPathParser.py
4. HarvestManUrlThread.py
5. HarvestManPageParser.py
6. HarvestManUrlConnector.py
Modification history
Feb 10 2004 Anand 1.3.1 bug fix release.
Jun 4-9 2004 Anand 1.4 development.
*Modified url/data push methods to use url
priorities. Priorities are assigned on the
basis of url generations and file types.
Webpages get higher priority than normal files.
Also, the priority can be specified in the configuration
file based on file extensions. All urls in the
same generation have the same priority. Urls in
a higher generation have lower priority than those
in a previous generation. (Added apply_url_priority
method).
*Assigned a 2:1 ratio for fetchers:crawlers. Tests
showed that this ratio provided the best download speeds
in most cases.
*Links are updated (for localising) only after
the url has been downloaded, and only if the download is successful.
*The main thread performs a 'join' on each tracker
instead of calling the 'stop' method at the end of
download. Joining is done with zero timeout.
*Added functionality to tidy html pages by using
uTidy, the python port of mxTidy. This helps to crawl
sites where previous versions used to exit because of
html errors in the starting page. Controlled by the
config variable named 'tidyhtml'.
Jun 14 2004 Anand 1.3.9 release.
"""
import os, sys
import socket
import time
import math
import threading
import bisect
import random
from Queue import Queue, Full, Empty
from sgmllib import SGMLParseError
from common import *
import urlparser
import htmlparser
import pageparser
__USEEVENT__=0
class HarvestManUrlCrawlerException(Exception):
def __init__(self, value):
self.value = value
def __str__(self):
return str(self.value)
class HarvestManBaseUrlCrawler( threading.Thread ):
""" Base class to do the crawling and fetching of internet/intranet urls.
This is the base class with no actual code apart from the threading or
termination functions. """
def __init__(self, index, url_obj = None, isThread = True):
# Index of the crawler
self._index = index
# Initialize my variables
self._initialize()
# Set url object
self.set_url_object(url_obj)
if isThread:
threading.Thread.__init__(self, None, None, self.get_role() + str(self._index))
def _initialize(self):
""" Initialise my state after construction """
# End flag
self._endflag = False
# Status of thread (This is different from the
# thread alive status. This is a harvestman
# crawler status )
# 0 => Idle
# 1 => Working
# 2 => Deadlocked
self._status = 0
# Download flag
self._download = True
# My url
self._url = ''
# The url object
self._urlobject = None
# Data for the url
self._data = ''
# Number of loops
self._loops = 0
# Role string
self._role = "undefined"
# Specific role string
self._sprole = "undefined"
# Event object
self._evt = threading.Event()
self._evt.clear()
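# This event acts as a simple handshake: crawl_url() clears it before
# trying to push a new url and sets it once the push has gone through,
# so a caller blocked in wait_for_data() wakes up only after this
# thread has put something on the queue.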
# Harvestman config object
self._configobj = GetObject('config')
# Crawler queue object
self._crawlerqueue = GetObject('trackerqueue')
def __str__(self):
return repr(self)
def get_role(self):
return self._role
def get_specific_role(self):
return self._sprole
def set_specific_role(self, role):
self._sprole = role
def set_role(self, role):
self._role = role
def get_url(self):
""" Return my url """
return self._url
def set_url(self, url):
""" Set my url """
self._url = url
def set_download_flag(self, val = True):
""" Set the download flag """
self._download = val
def set_url_object(self, obj):
""" Set the url object of this crawler """
self._urlobject = obj
# Guard against the default (None) url object passed in by __init__
if obj is None: return
self._url = self._urlobject.get_full_url()
def set_index(self, index):
self._index = index
def get_index(self):
return self._index
def get_url_object(self):
""" Return the url object of this crawler """
return self._urlobject
def action(self):
""" The action method, to be overridden by
sub classes to provide action """
pass
def run(self):
""" The overloaded run method of threading.Thread class """
debug("Started crawler thread => ", self)
self.action()
def terminate(self):
""" Kill this crawler thread """
self.stop()
msg = self.getName() + ' Killed'
raise HarvestManUrlCrawlerException, msg
def stop(self):
""" Stop this crawler thread """
self._status = 0
self._endflag = True
self.set_download_flag(False)
def get_status(self):
""" Return the running status of this crawler """
return self._status
def get_status_string(self):
""" Return the running status of this crawler as a string """
if self._status == 0:
return "idle"
elif self._status == 1:
return "busy"
elif self._status == 2:
return "locked"
def is_busy(self):
""" Return whether I am busy or not """
return self._status != 0
def is_locked(self):
""" Return whether I am locked or not """
return self._status == 2
def wait_for_data(self):
""" Any thread calling this method will be blocked
till this thread has pushed some data to the
queue """
self._evt.wait()
return True
def crawl_url(self):
""" Crawl a web page, recursively downloading its links """
pass
def process_url(self):
""" Download the data for a web page or a link and
manage its data """
pass
class HarvestManUrlCrawler(HarvestManBaseUrlCrawler):
""" The crawler class which crawls urls and fetches their links.
These links are posted to the url queue """
def __init__(self, index, url_obj = None, isThread=True):
HarvestManBaseUrlCrawler.__init__(self, index, url_obj, isThread)
def _initialize(self):
HarvestManBaseUrlCrawler._initialize(self)
self._role = "crawler"
self.__wp = None
def __str__(self):
return repr(self)
def set_url_object(self, obj):
if obj is None: return None
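# Work items for crawlers arrive as (priority, (url object,
# zlib-compressed page data)); decompress the page data before
# handing the url object to the base class.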
prior, (url_obj, cdata) = obj
# moreinfo('I am for url ->', url_obj.get_full_url() )
import zlib
self._data = zlib.decompress(cdata)
HarvestManBaseUrlCrawler.set_url_object(self, url_obj)
def action(self):
if isinstance(self, threading.Thread):
self._loops = 0
while not self._endflag:
obj = self._crawlerqueue.get_url_data( self.get_role(), self.get_specific_role() )
if obj is None: continue
self.set_url_object(obj)
# Set status to one to denote busy state
self._status = 1
self.process_url()
self.crawl_url()
self._loops += 1
time.sleep(0.5) # Could be customized as a config variable
# Set status to zero to denote idle state
self._status = 0
else:
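# Non-threaded mode: process the url object set at construction
# time once and return.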
self.process_url()
self.crawl_url()
def apply_url_priority(self, url_obj):
""" Apply priority to url objects """
cfg = GetObject('config')
# Set initial priority to previous url's generation
url_obj.set_priority( self._urlobject.get_generation())
# Get priority
curr_priority = url_obj.get_priority()
# html files (webpages) get higher priority
if url_obj.is_webpage():
curr_priority -= 1
# Apply any priorities specified based on file extensions in
# the config file.
pr_dict1, pr_dict2 = cfg.urlprioritydict, cfg.serverprioritydict
# Get file extension
extn = ((os.path.splitext(url_obj.get_filename()))[1]).lower()
# Skip the '.'
extn = extn[1:]
# Get domain (server)
domain = url_obj.get_domain()
# Apply url priority
if extn in pr_dict1.keys():
curr_priority -= int(pr_dict1[extn])
# Apply server priority; this allows a partial
# key match
for key in pr_dict2.keys():
# Apply the first match
if domain.find(key) != -1:
curr_priority -= int(pr_dict2[key])
break
# Set priority again
url_obj.set_priority(curr_priority)
# moreinfo('My priority is', curr_priority,'=> ', url_obj.get_full_url())
return 1
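# A rough worked example of the priority arithmetic above, using purely
# hypothetical configuration values (the real dictionaries are read from
# the configuration file and are not shown here):
#
#   cfg.urlprioritydict    = {'html': 1, 'zip': -1}    # hypothetical
#   cfg.serverprioritydict = {'example.com': 2}        # hypothetical
#
# For a '.html' webpage on 'www.example.com' whose parent url is at
# generation 2, the computation goes:
#   priority  = 2    (starts at the parent url's generation)
#   priority -= 1    (the url is a webpage)
#   priority -= 1    (extension 'html' is listed in urlprioritydict)
#   priority -= 2    (the domain matches the 'example.com' key)
# giving a final priority of -2; as described in the module docstring,
# a lower number means the url is treated with higher priority.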
def crawl_url(self):
""" Crawl a web page, recursively downloading its links """
if not self._urlobject.is_webpage(): return None
if not self._download: return None
if self.__wp is None:
return None
# Rules checker object
ruleschecker = GetObject('ruleschecker')
ruleschecker.add_link(self._url)
# Data manager object
dmgr = GetObject('datamanager')
# Gather the links (and optionally images) parsed from this page
moreinfo('\nFetching links for url', self._url)
links = self.__wp.links[0:]
if self._configobj.images:
links.extend(self.__wp.images)
priority_indx = 0
base_url = self._urlobject.get_full_url()
for typ, childurl in links:
# Check for status flag to end loop
if self._endflag: break
self._evt.clear()
is_cgi, is_php = False, False
if childurl.find('php?') != -1: is_php = True
if typ == 'form' or is_php: is_cgi = True
try:
url_obj = urlparser.HarvestManUrlParser(childurl,
typ,
is_cgi,
self._urlobject)
url_obj.set_generation( self._urlobject.get_generation() + 1 )
except urlparser.HarvestManUrlParserError, e:
debug(str(e), childurl)
continue
# New in 1.2 (rc3) - get javascript links (.js)
if typ == 'javascript':
# moreinfo(" I found a javascript tag!")
if not self._configobj.javascript:
continue
elif typ == 'javaapplet':
# moreinfo("I found a java applet class")
if not self._configobj.javaapplet:
continue
if ruleschecker.is_duplicate_link( url_obj.get_full_url() ):
continue
# Check for basic rules of download
if url_obj.violates_rules():
continue
if self._configobj.fastmode:
gl = self._crawlerqueue.is_locked_up('fetcher')
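# If the fetcher side appears to be locked up (a potential deadlock),
# spawn an extra daemon fetcher thread seeded with this url so that
# it still gets picked up.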
if gl:
self._crawlerqueue.increment_lock_instance()
wht = HarvestManUrlFetcher(self._crawlerqueue.getLastTrackerIndex() + 1, url_obj)
# Set thread as daemon, so that threads can be killed
# cleanly without hanging Python program (main thread)
wht.setDaemon(True)
self._crawlerqueue.add_tracker( wht )
wht.start()
# Thread is going to push data, set status to locked...
self._status = 2
priority_indx += 1
self.apply_url_priority( url_obj )
self._crawlerqueue.push( url_obj, self.get_role())
# Thread was able to push data, set status to busy...
self._status = 1
self._evt.set()
else:
tracker = harvestManUrlTracker( self._index + 1, url_obj, False )
tracker.action()
if not self._evt.isSet():
self._evt.set()
return None
def process_url(self):
""" Parse the url's data and post new urls to the queue """
if not self._urlobject.is_webpage(): return None
extrainfo("Parsing web page ", self._url)
try:
if self._configobj.htmlparser==0:
self.__wp = pageparser.harvestManSimpleParser()
elif self._configobj.htmlparser==1:
self.__wp = pageparser.harvestManFastParser()
# use tidylib to clean up html data
if self._configobj.tidyhtml:
import tidy
options=dict(indent=1, tidy_mark=1, fix_uri=1)
self._data = str(tidy.parseString( self._data, **options ))
self.__wp.feed(self._data)
self.__wp.close()