""" HarvestManUrlTracker.py - Module to track and download urls
from the internet using urllib2. This software is part
of the HarvestMan(R) program.
Author: Anand B Pillai (anandpillai at letterboxes dot org).
For licensing information see the file LICENSE.txt that
is included in this distribution.
Dependency
==========
1. HarvestManHTMLParser.py
2. HarvestManRobotParser.py
3. HarvestManUrlPathParser.py
4. HarvestManUrlThread.py
5. HarvestManPageParser.py
6. HarvestManUrlConnector.py
Modification history
Feb 10 2004 Anand 1.3.1 bug fix release.
Jun 4-9 2004 Anand 1.4 development.
*Modified url/data push methods to use url
priorities. Priorities are assigned on the
basis of url generations and file types.
Webpages get higher priority than normal files.
Also, the priority can be specified in the configuration
file based on file extensions. All urls in the
same generation have the same priority. Urls in
a higher generation have lower priority than those
in a previous generation. (Added apply_url_priority
method).
*Assigned a 2:1 ratio for fetchers:crawlers. Tests
showed that this ratio provided the best download speeds
in most cases.
*Links are updated (for localising) only after
the url has been downloaded, and only if the download is successful.
*The main thread performs a 'join' on each tracker
instead of calling the 'stop' method at the end of
download. Joining is done with zero timeout.
*Added functionality to tidy html pages by using
uTidy, the python port of mxTidy. This helps to crawl
sites where previous versions used to exit because of
html errors in the starting page. Controlled by the
config variable named 'tidyhtml'.
Jun 14 2004 Anand 1.3.9 release.
"""
import os, sys
import socket
import time
import math
import threading
import bisect
import random
from Queue import Queue, Full, Empty
from sgmllib import SGMLParseError
from common import *
import urlparser
import htmlparser
import pageparser
__USEEVENT__=0
class HarvestManUrlCrawlerException(Exception):
def __init__(self, value):
self.value = value
def __str__(self):
return str(self.value)
class HarvestManBaseUrlCrawler( threading.Thread ):
""" Base class to do the crawling and fetching of internet/intranet urls.
This is the base class with no actual code apart from the threading or
termination functions. """
def __init__(self, index, url_obj = None, isThread = True):
# Index of the crawler
self._index = index
# Initialize my variables
self._initialize()
# Set url object
self.set_url_object(url_obj)
if isThread:
threading.Thread.__init__(self, None, None, self.get_role() + str(self._index))
def _initialize(self):
""" Initialise my state after construction """
# End flag
self._endflag = False
# Status of thread (This is different from the
# thread alive status. This is a harvestman
# crawler status )
# 0 => Idle
# 1 => Working
# 2 => Deadlocked
self._status = 0
# Download flag
self._download = True
# My url
self._url = ''
# The url object
self._urlobject = None
# Data for the url
self._data = ''
# Number of loops
self._loops = 0
# Role string
self._role = "undefined"
# Specific role string
self._sprole = "undefined"
# Event object
self._evt = threading.Event()
self._evt.clear()
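# This event acts as a simple handshake: crawl_url() clears it before
# trying to push a new url and sets it once the push has gone through,
# so a caller blocked in wait_for_data() wakes up only after this
# thread has put something on the queue.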
# Harvestman config object
self._configobj = GetObject('config')
# Crawler queue object
self._crawlerqueue = GetObject('trackerqueue')
def __str__(self):
return repr(self)
def get_role(self):
return self._role
def get_specific_role(self):
return self._sprole
def set_specific_role(self, role):
self._sprole = role
def set_role(self, role):
self._role = role
def get_url(self):
""" Return my url """
return self._url
def set_url(self, url):
""" Set my url """
self._url = url
def set_download_flag(self, val = True):
""" Set the download flag """
self._download = val
def set_url_object(self, obj):
""" Set the url object of this crawler """
self._urlobject = obj
# Guard against the default (None) url object passed in by __init__
if obj is None: return
self._url = self._urlobject.get_full_url()
def set_index(self, index):
self._index = index
def get_index(self):
return self._index
def get_url_object(self):
""" Return the url object of this crawler """
return self._urlobject
def action(self):
""" The action method, to be overridden by
sub classes to provide action """
pass
def run(self):
""" The overloaded run method of threading.Thread class """
debug("Started crawler thread => ", self)
self.action()
def terminate(self):
""" Kill this crawler thread """
self.stop()
msg = self.getName() + ' Killed'
raise HarvestManUrlCrawlerException, msg
def stop(self):
""" Stop this crawler thread """
self._status = 0
self._endflag = True
self.set_download_flag(False)
def get_status(self):
""" Return the running status of this crawler """
return self._status
def get_status_string(self):
""" Return the running status of this crawler as a string """
if self._status == 0:
return "idle"
elif self._status == 1:
return "busy"
elif self._status == 2:
return "locked"
def is_busy(self):
""" Return whether I am busy or not """
return self._status != 0
def is_locked(self):
""" Return whether I am locked or not """
return self._status == 2
def wait_for_data(self):
""" Any thread calling this method will be blocked
till this thread has pushed some data to the
queue """
self._evt.wait()
return True
def crawl_url(self):
""" Crawl a web page, recursively downloading its links """
pass
def process_url(self):
""" Download the data for a web page or a link and
manage its data """
pass
class HarvestManUrlCrawler(HarvestManBaseUrlCrawler):
""" The crawler class which crawls urls and fetches their links.
These links are posted to the url queue """
def __init__(self, index, url_obj = None, isThread=True):
HarvestManBaseUrlCrawler.__init__(self, index, url_obj, isThread)
def _initialize(self):
HarvestManBaseUrlCrawler._initialize(self)
self._role = "crawler"
self.__wp = None
def __str__(self):
return repr(self)
def set_url_object(self, obj):
if obj is None: return None
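# Work items for crawlers arrive as (priority, (url object,
# zlib-compressed page data)); decompress the page data before
# handing the url object to the base class.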
prior, (url_obj, cdata) = obj
# moreinfo('I am for url ->', url_obj.get_full_url() )
import zlib
self._data = zlib.decompress(cdata)
HarvestManBaseUrlCrawler.set_url_object(self, url_obj)
def action(self):
if isinstance(self, threading.Thread):
self._loops = 0
while not self._endflag:
obj = self._crawlerqueue.get_url_data( self.get_role(), self.get_specific_role() )
if obj is None: continue
self.set_url_object(obj)
# Set status to one to denote busy state
self._status = 1
self.process_url()
self.crawl_url()
self._loops += 1
time.sleep(0.5) # Could be customized as a config variable
# Set status to zero to denote idle state
self._status = 0
else:
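# Non-threaded mode: process the url object set at construction
# time once and return.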
self.process_url()
self.crawl_url()
def apply_url_priority(self, url_obj):
""" Apply priority to url objects """
cfg = GetObject('config')
# Set initial priority to previous url's generation
url_obj.set_priority( self._urlobject.get_generation())
# Get priority
curr_priority = url_obj.get_priority()
# html files (webpages) get higher priority
if url_obj.is_webpage():
curr_priority -= 1
# Apply any priorities specified based on file extensions in
# the config file.
pr_dict1, pr_dict2 = cfg.urlprioritydict, cfg.serverprioritydict
# Get file extension
extn = ((os.path.splitext(url_obj.get_filename()))[1]).lower()
# Skip the '.'
extn = extn[1:]
# Get domain (server)
domain = url_obj.get_domain()
# Apply url priority
if extn in pr_dict1.keys():
curr_priority -= int(pr_dict1[extn])
# Apply server priority; this allows a partial
# key match
for key in pr_dict2.keys():
# Apply the first match
if domain.find(key) != -1:
curr_priority -= int(pr_dict2[key])
break
# Set priority again
url_obj.set_priority(curr_priority)
# moreinfo('My priority is', curr_priority,'=> ', url_obj.get_full_url())
return 1
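# A rough worked example of the priority arithmetic above, using purely
# hypothetical configuration values (the real dictionaries are read from
# the configuration file and are not shown here):
#
#   cfg.urlprioritydict    = {'html': 1, 'zip': -1}    # hypothetical
#   cfg.serverprioritydict = {'example.com': 2}        # hypothetical
#
# For a '.html' webpage on 'www.example.com' whose parent url is at
# generation 2, the computation goes:
#   priority  = 2    (starts at the parent url's generation)
#   priority -= 1    (the url is a webpage)
#   priority -= 1    (extension 'html' is listed in urlprioritydict)
#   priority -= 2    (the domain matches the 'example.com' key)
# giving a final priority of -2; as described in the module docstring,
# a lower number means the url is treated with higher priority.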
def crawl_url(self):
""" Crawl a web page, recursively downloading its links """
if not self._urlobject.is_webpage(): return None
if not self._download: return None
if self.__wp is None:
return None
# Rules checker object
ruleschecker = GetObject('ruleschecker')
ruleschecker.add_link(self._url)
# Data manager object
dmgr = GetObject('datamanager')
# Gather the links (and optionally images) parsed from this page
moreinfo('\nFetching links for url', self._url)
links = self.__wp.links[0:]
if self._configobj.images:
links.extend(self.__wp.images)
priority_indx = 0
base_url = self._urlobject.get_full_url()
for typ, childurl in links:
# Check for status flag to end loop
if self._endflag: break
self._evt.clear()
is_cgi, is_php = False, False
if childurl.find('php?') != -1: is_php = True
if typ == 'form' or is_php: is_cgi = True
try:
url_obj = urlparser.HarvestManUrlParser(childurl,
typ,
is_cgi,
self._urlobject)
url_obj.set_generation( self._urlobject.get_generation() + 1 )
except urlparser.HarvestManUrlParserError, e:
debug(str(e), childurl)
continue
# New in 1.2 (rc3) - get javascript links (.js)
if typ == 'javascript':
# moreinfo(" I found a javascript tag!")
if not self._configobj.javascript:
continue
elif typ == 'javaapplet':
# moreinfo("I found a java applet class")
if not self._configobj.javaapplet:
continue
if ruleschecker.is_duplicate_link( url_obj.get_full_url() ):
continue
# Check for basic rules of download
if url_obj.violates_rules():
continue
if self._configobj.fastmode:
gl = self._crawlerqueue.is_locked_up('fetcher')
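# If the fetcher side appears to be locked up (a potential deadlock),
# spawn an extra daemon fetcher thread seeded with this url so that
# it still gets picked up.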
if gl:
self._crawlerqueue.increment_lock_instance()
wht = HarvestManUrlFetcher(self._crawlerqueue.getLastTrackerIndex() + 1, url_obj)
# Set thread as daemon, so that threads can be killed
# cleanly without hanging Python program (main thread)
wht.setDaemon(True)
self._crawlerqueue.add_tracker( wht )
wht.start()
# Thread is going to push data, set status to locked...
self._status = 2
priority_indx += 1
self.apply_url_priority( url_obj )
self._crawlerqueue.push( url_obj, self.get_role())
# Thread was able to push data, set status to busy...
self._status = 1
self._evt.set()
else:
tracker = harvestManUrlTracker( self._index + 1, url_obj, False )
tracker.action()
if not self._evt.isSet():
self._evt.set()
return None
def process_url(self):
""" Parse the url's data and post new urls to the queue """
if not self._urlobject.is_webpage(): return None
extrainfo("Parsing web page ", self._url)
try:
if self._configobj.htmlparser==0:
self.__wp = pageparser.harvestManSimpleParser()
elif self._configobj.htmlparser==1:
self.__wp = pageparser.harvestManFastParser()
# use tidylib to clean up html data
if self._configobj.tidyhtml:
import tidy
options=dict(indent=1, tidy_mark=1, fix_uri=1)
self._data = str(tidy.parseString( self._data, **options ))
self.__wp.feed(self._data)
self.__wp.close()