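# File: urlcollections.py
# (The original page showed a code-viewer "font size" control here; it is not part of the module.)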
# -*- coding: utf-8 -*-
"""urlcollections.py - Module which defines URL collection
and context classes.

URL collection classes allow a programmer to
create collections (aggregations) of URL objects
with respect to certain contexts. This allows one to
treat URL objects belonging to the collection (and hence
the context) as a single unit, allowing you to write
code based on the context rather than based on
the URL.

Examples of contexts include the stylesheet context,
where a web-page and its CSS files form part of
this context. Other examples are frame contexts, where
a context is associated to all frame URLs originating
from a web-page, and page contexts, which basically
associate all URLs in a page to the page URL.

This module is part of the HarvestMan program.
For licensing information see the file LICENSE.txt that
is included in this distribution.

Author: Anand B Pillai <abpillai at gmail dot com>

Created Anand B Pillai April 17 2007 Based on inputs from
                                     the EIAO project.
Mod     Anand B Pillai May 26 2007   Added HarvestManAutoUrlCollection
                                     class which automatically
                                     categorizes URLs to contexts.
                                     Also, modified
                                     HarvestManUrlCollection class so
                                     that a collection class can be
                                     associated to multiple contexts.

Copyright (C) 2007, Anand B Pillai.
"""

__version__ = '2.0 b1'
__author__ = 'Anand B Pillai'

from harvestman.lib import urltypes
from harvestman.lib.urlparser import HarvestManUrl


class HarvestManUrlCollectionException(Exception):
    """ Exception raised for invalid operations on URL collections """
    pass


class HarvestManUrlContext(object):
    """ Base URL context type for HarvestMan.

    A context is a plain class carrying four class attributes; it is
    used as a dictionary key by the collection classes below and is
    never instantiated in this module.
    """

    # Name for the context
    name = 'BASE_URL_CONTEXT'
    # Description for the context
    description = 'Base type for URL contexts'
    # Source URL type for the context
    sourceurltype = urltypes.URL_TYPE_ANY
    # Bag URL types for the context
    bagurltype = urltypes.URL_TYPE_ANY


class HarvestManPageContext(HarvestManUrlContext):
    """ Page context class. This context ties a webpage URL
    with its child URLs """

    # Name for the context
    name = 'PAGE_URL_CONTEXT'
    # Description for the context
    description = 'Context type associating a page to its children'
    # Source URL type for the context
    sourceurltype = urltypes.URL_TYPE_WEBPAGE
    # Bag URL types for the context
    bagurltype = urltypes.URL_TYPE_ANY


class HarvestManFrameContext(HarvestManPageContext):
    """ Frame context. This context ties a frameset URL to
    the frame URLs """

    # Name for the context
    name = 'FRAME_URL_CONTEXT'
    # Description of the context
    description = 'Context for tying a frameset URL to its frame URLs'
    # Source URL type for the context
    sourceurltype = urltypes.URL_TYPE_FRAMESET
    # Bag URL types for the context
    bagurltype = urltypes.URL_TYPE_FRAME


class HarvestManStyleContext(HarvestManPageContext):
    """ Style context. This context ties a webpage URL to
    its stylesheet (css) URLs """

    # Name for the context
    name = 'STYLE_URL_CONTEXT'
    # Description of the context
    description = 'Context for tying a webpage to its stylesheets'
    # Source URL type for the context
    sourceurltype = urltypes.URL_TYPE_WEBPAGE
    # Bag URL types for the context
    bagurltype = urltypes.URL_TYPE_STYLESHEET


class HarvestManCSSContext(HarvestManPageContext):
    """ CSS context. This context ties a stylesheet URL to
    any URLs defined inside the stylesheet """

    # Name for the context
    name = 'CSS_URL_CONTEXT'
    # Description of the context
    description = 'Context for tying a stylesheet to its child URLs'
    # Source URL type for the context
    sourceurltype = urltypes.URL_TYPE_STYLESHEET
    # Bag URL types for the context
    bagurltype = urltypes.URL_TYPE_ANY


class HarvestManCSS2Context(HarvestManCSSContext):
    """ CSS2 context. This context ties a stylesheet URL to
    any other stylesheets imported in it """

    # Name for the context
    name = 'CSS2_URL_CONTEXT'
    # Description of the context
    description = 'Context for tying a stylesheet to any stylesheets imported in it'
    # Source URL type for the context
    sourceurltype = urltypes.URL_TYPE_STYLESHEET
    # Bag URL types for the context
    bagurltype = urltypes.URL_TYPE_STYLESHEET


class HarvestManUrlCollection(object):
    """ URL collection class for HarvestMan.

    This class is designed as a bag for HarvestManUrl objects, tied
    to one or more contexts. Its key attributes are the index and
    type of the URL object from which the contexts originate (the
    'source' URL object) and a dictionary mapping each context class
    to the list of indices of the URL objects added under it.
    """

    def __init__(self, source=None):
        """ Create an empty collection.

        source - the source URL object, or None (or any false value)
        for a collection without a source. Only its 'index' and 'typ'
        attributes are read.
        """

        # For efficiency purposes, we do not keep references
        # to url objects, only their indices.
        if source:
            self._source = source.index
            self._sourcetyp = source.typ
        else:
            self._source = None
            self._sourcetyp = urltypes.URL_TYPE_NONE

        # Maps context class => list of URL indices
        self._collections = {}

    def _getContext(self, urlobj):
        """ Return the context at which the URL urlobj is to
        be inserted. Sub-classes override this to pick the
        context per URL """

        # This class always returns HarvestManPageContext
        return HarvestManPageContext

    def addURL(self, urlobj):
        """ Add a url object to the collection.

        Only the index of urlobj is stored, filed under the context
        returned by _getContext(urlobj).

        Raises HarvestManUrlCollectionException if urlobj is not a
        HarvestManUrl instance, or if its type is not compatible with
        the bag URL type of the selected context.
        """

        if not isinstance(urlobj, HarvestManUrl):
            raise HarvestManUrlCollectionException(
                'Error: Wrong argument type, expecting HarvestManUrl instance!')

        context = self._getContext(urlobj)

        # Check if the type of the url object matches the
        # bagurltype defined for this context. Here we do an
        # isA check since the url object's type can be a
        # specialized form (derived class) of the bagurltype.
        if not urlobj.typ.isA(context.bagurltype):
            raise HarvestManUrlCollectionException(
                'Error: mismatch in context and bag URL types!')

        # For efficiency on memory, we do not append url objects
        # to the list, only their indices. Url objects can be
        # mapped out later using their index from the datamgr object.
        self._collections.setdefault(context, []).append(urlobj.index)

    def getSourceURL(self):
        """ Return the index of the source URL object (None if the
        collection was created without a source) """

        return self._source

    def getSourceURLType(self):
        """ Return the type of the source URL object
        (urltypes.URL_TYPE_NONE if there is no source) """

        return self._sourcetyp

    def getURLs(self, context):
        """ Return the list of URL indices stored for the given
        context, or None if nothing was added under that context.

        The returned list is the internal one, not a copy.
        """

        return self._collections.get(context)

    def getAllURLs(self):
        """ Return a new list with the URL indices of all
        contexts of this collection """

        allurls = []
        for urls in self._collections.values():
            allurls.extend(urls)

        return allurls

    def getContextDict(self):
        """ Return a shallow copy of the internal context dictionary
        (the index lists themselves are shared, not copied) """

        return self._collections.copy()


class HarvestManAutoUrlCollection(HarvestManUrlCollection):
    """ A sub-class of HarvestManUrlCollection which automatically
    assigns contexts to URLs """

    def _getContext(self, urlobj):
        """ Return the context at which the URL urlobj is to
        be inserted.

        For frames, return HarvestManFrameContext.
        For CSS files
          1. Source => webpage, return HarvestManStyleContext
          2. Source => stylesheet, return HarvestManCSS2Context
        For other URLs
          1. Source => stylesheet, return HarvestManCSSContext
          2. Otherwise, return HarvestManPageContext
        """

        if urlobj.typ == urltypes.URL_TYPE_FRAME:
            return HarvestManFrameContext

        if urlobj.typ == urltypes.URL_TYPE_STYLESHEET:
            if self._sourcetyp.isA(urltypes.URL_TYPE_WEBPAGE):
                return HarvestManStyleContext
            if self._sourcetyp.isA(urltypes.URL_TYPE_STYLESHEET):
                return HarvestManCSS2Context
            # A stylesheet whose source is neither a webpage nor a
            # stylesheet (e.g. a collection created without a source)
            # used to fall off the end here and return None, which made
            # addURL fail with an AttributeError. Fall through to the
            # generic rule below instead.
            # NOTE(review): this assumes a stylesheet type satisfies
            # isA(URL_TYPE_ANY) - confirm against urltypes.

        # For all other url types
        if self._sourcetyp.isA(urltypes.URL_TYPE_STYLESHEET):
            return HarvestManCSSContext

        return HarvestManPageContext
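# --- Code-viewer page chrome (not part of the module) ---
# Keyboard shortcuts:
#   Copy code:          Ctrl + C
#   Search code:        Ctrl + F
#   Full-screen mode:   F11
#   Toggle theme:       Ctrl + Shift + D
#   Show shortcuts:     ?
#   Increase font size: Ctrl + =
#   Decrease font size: Ctrl + -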