⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 common.py

📁 网络蜘蛛
💻 PY
📖 第 1 页 / 共 2 页
字号:
""" Global functions for HarvestMan(R) Program.
This file is part of the HarvestMan(R) software.
For licensing information, see file LICENSE.TXT.

Author: Anand B Pillai (anandpillai at letterboxes dot org).

Created: Jun 10 2003

Jun 4-9 2004 Anand   1.4 development version.

                     *GetObject() returns a weakref
                     proxy instead of another reference
                     to the stored objects.

                     *Rewrote global registry class.
                     Renamed it to 'Registry' and made it
                     a normal singleton. Added __slots__.
                     Added methods get_object_key and
                     get_class_key.

                     *SetObject() finds out object key from
                     registry object, so an extra argument
                     is omitted.

  Jun 14 2004         Anand          1.3.9 release.                     

"""

import weakref
import os, sys
import binascii

# Names exported by a star-import of this module, in the original order.
# Several of the listed functions are not defined in the portion of the
# file shown here.
__all__ = [
    "varprint",
    "info",
    "moreinfo",
    "extrainfo",
    "debug",
    "moredebug",
    "wasOrWere",
    "plural",
    "filetype",
    "rename",
    "obfuscate",
    "unobfuscate",
    "bin_crypt",
    "bin_decrypt",
    "htmlLikeFile",
    "Initialize",
    "Finish",
    "SetUserAgent",
    "GetObject",
    "GetRegistryObject",
    "SetObject",
    "SetUserDebug",
    # String constants
    "HARVESTMAN_SIG",
    "HARVESTMAN_PROJECTINFO",
    "HARVESTMAN_BOAST",
    "HARVESTMAN_KEYWORDS",
    "HARVESTMAN_CREDITS",
    "HARVESTMAN_BROWSER_CSS",
    "HARVESTMAN_BROWSER_TABLE1",
    "HARVESTMAN_BROWSER_HEADER",
    "HARVESTMAN_BROWSER_TABLE2",
    "HARVESTMAN_BROWSER_TABLE3",
    "HARVESTMAN_CACHE_README",
]

#============================== Start Browser page macro strings ================================================ #
# Short signature string (presumably the program's nickname -- confirm
# against the code that uses it).
HARVESTMAN_SIG="Daddy Long Legs"

# HTML table-row template. It carries the named %-format placeholders
# PROJECTNAME, PROJECTSTARTPAGE and PROJECTURL, so it is meant to be filled
# with the "%" operator and a mapping that supplies those keys.
HARVESTMAN_PROJECTINFO="""\
<TR align=center>
    <TD>
    %(PROJECTNAME)s
    </TD>
    <TD>&middot;
    <!-- PROJECTPAGE --><A HREF=\"%(PROJECTSTARTPAGE)s\"><!-- END -->
    <!-- PROJECTURL -->%(PROJECTURL)s<!-- END -->
        </A>
    </TD>
</TR>"""

# One-paragraph description of the program (plain text, no placeholders).
HARVESTMAN_BOAST="""HarvestMan is an easy-to-use website copying utility. It allows you to download a website in the World Wide Web from the Internet to a local directory. It retrieves html, images, and other files from the remote server to your computer. It builds the local directory structures recursively, and rebuilds links relatively so that you can browse the local site without again connecting to the internet. The robot allows you to customize it in a variety of ways, filtering files based on file extensions/websites/keywords. The robot is customizable by using a configuration file. The program is completely written in Python."""

# Comma-separated keyword list (presumably for an HTML "keywords" meta
# tag -- confirm against the page generator).
HARVESTMAN_KEYWORDS="""HarvestMan, HARVESTMAN, HARVESTMan, offline browser, robot, web-spider, website mirror utility, aspirateur web, surf offline, web capture, www mirror utility, browse offline, local  site builder, website mirroring, aspirateur www, internet grabber, capture de site web, internet tool, hors connexion, windows, windows 95, windows 98, windows nt, windows 2000, python apps, python tools, python spider"""

# Copyright line as an HTML fragment (uses the &copy; entity).
HARVESTMAN_CREDITS="""\
&copy; 2004-2005, Anand B Pillai. """

# Stylesheet text for the generated browser/index page. It has no %-format
# placeholders and refers to two image files by relative URL: backblue.gif
# and fade.gif. The ids it styles (#subTitle, #pageContent) and the classes
# (.blak, .tableWidth) are the ones used by the HARVESTMAN_BROWSER_TABLE*
# templates in this module.
HARVESTMAN_BROWSER_CSS="""\
body {
    margin: 0;
    padding: 1;
    margin-bottom: 15px;
    margin-top: 15px;
    background: #678;
}
body, td {
    font: 14px Arial, Times, sans-serif;
    }

#subTitle {
    background: #345;  color: #fff;  padding: 4px;  font-weight: bold;
    }

#siteNavigation a, #siteNavigation .current {
    font-weight: bold;  color: #448;
    }
#siteNavigation a:link    { text-decoration: none; }
#siteNavigation a:visited { text-decoration: none; }

#siteNavigation .current { background-color: #ccd; }

#siteNavigation a:hover   { text-decoration: none;  background-color: #fff;  color: #000; }
#siteNavigation a:active  { text-decoration: none;  background-color: #ccc; }


a:link    { text-decoration: underline;  color: #00f; }
a:visited { text-decoration: underline;  color: #000; }
a:hover   { text-decoration: underline;  color: #c00; }
a:active  { text-decoration: underline; }

#pageContent {
    clear: both;
    border-bottom: 6px solid #000;
    padding: 10px;  padding-top: 20px;
    line-height: 1.65em;
    background-image: url(backblue.gif);
    background-repeat: no-repeat;
    background-position: top right;
    }

#pageContent, #siteNavigation {
    background-color: #ccd;
    }


.imgLeft  { float: left;   margin-right: 10px;  margin-bottom: 10px; }
.imgRight { float: right;  margin-left: 10px;   margin-bottom: 10px; }

hr { height: 1px;  color: #000;  background-color: #000;  margin-bottom: 15px; }

h1 { margin: 0;  font: 14px \"Monotype Corsiva\", Times, Arial;
font-weight: bold;  font-size: 2em; }
h2 { margin: 0;  font-weight: bold;  font-size: 1.6em; }
h3 { margin: 0;  font-weight: bold;  font-size: 1.3em; }
h4 { margin: 0;  font-weight: bold;  font-size: 1.18em; }

.blak { background-color: #000; }
.hide { display: none; }
.tableWidth { min-width: 400px; }

.tblRegular       { border-collapse: collapse; }
.tblRegular td    { padding: 6px;  background-image: url(fade.gif);  border: 2px solid #99c; }
.tblHeaderColor, .tblHeaderColor td { background: #99c; }
.tblNoBorder td   { border: 0; }"""

# Title banner for the browser page: a one-cell HTML table whose cell uses
# the "subTitle" id. NOTE: unlike the TABLE2/TABLE3 templates this string
# contains a bare "76%" and no named placeholders, so it is not suitable
# for "%" formatting as written.
HARVESTMAN_BROWSER_TABLE1="""\
<table width=\"76%\" border=\"0\" align=\"center\" cellspacing=\"0\" cellpadding=\"3\" class=\"tableWidth\">
    <tr>
    <td id=\"subTitle\">HARVESTMan Internet Spider - Website Copier</td>
    </tr>
</table>"""

# Heading text (presumably substituted for the HEADER placeholder of
# HARVESTMAN_BROWSER_TABLE2 -- confirm against the caller).
HARVESTMAN_BROWSER_HEADER="Index of Downloaded Sites:"

# Main body template of the browser/index page. It is a %-format template
# with named placeholders: PER, VERSION, HEADER, PROJECTNAME,
# PROJECTSTARTPAGE and PROJECTURL. PER always follows a number inside a
# width attribute ("76%(PER)s"), so it is expected to be given a literal
# "%" sign.
#
# Fix: the heading line opened <u> but closed it with </i>; the closing tag
# is now </u> so the emitted HTML is properly nested.
HARVESTMAN_BROWSER_TABLE2= """\
<table width=\"76%(PER)s\" border=\"0\" align=\"center\" cellspacing=\"0\" cellpadding=\"0\" class=\"tableWidth\">
<tr class=\"blak\">
<td>
    <table width=\"100%(PER)s\" border=\"0\" align=\"center\" cellspacing=\"1\" cellpadding=\"0\">
    <tr>
    <td colspan=\"6\">
        <table width=\"100%(PER)s\" border=\"0\" align=\"center\" cellspacing=\"0\" cellpadding=\"10\">
        <tr>
        <td id=\"pageContent\">
<!-- ==================== End prologue ==================== -->

    <meta name=\"generator\" content=\"HARVESTMAN Internet Spider Version %(VERSION)s \">
    <TITLE>Local index - HarvestMan</TITLE>
</HEAD>
<h1 ALIGN=left><u>%(HEADER)s</u></h1>
    <TABLE BORDER=\"0\" WIDTH=\"100%(PER)s\" CELLSPACING=\"1\" CELLPADDING=\"0\">
    <BR>
        <TR align=center>
            <TD>
            %(PROJECTNAME)s
            </TD>
            <TD>&middot;
                <!-- PROJECTPAGE --><A HREF=\"%(PROJECTSTARTPAGE)s\"><!-- END -->
                    <!-- PROJECTURL -->%(PROJECTURL)s<!-- END -->
                </A>
            </TD>
        </TR>
    </TABLE>
    <BR>
    <BR>
    <BR>
    <H6 ALIGN=\"RIGHT\">
    <I>Mirror and index made by HARVESTMan Internet Spider [ABP &amp; NK 2003]</I>
    </H6>
<!-- ==================== Start epilogue ==================== -->
    </td>
    </tr>
    </table>
    </td>
    </tr>
    </table>
</td>
</tr>
</table>"""

# Footer template for the browser page: a %-format template with the named
# placeholders PER (used right after "76", so expected to be a literal "%")
# and CREDITS (placed inside the "footer" cell).
HARVESTMAN_BROWSER_TABLE3="""\
<table width=\"76%(PER)s\" border=\"0\" align=\"center\" valign=\"bottom\" cellspacing=\"0\" cellpadding=\"0\">
    <tr>
    <td id=\"footer\"><small>%(CREDITS)s </small></td>
    </tr>
</table>"""

# Plain-text notice describing the cache directory (presumably written into
# that directory as a README file -- confirm against the caller).
HARVESTMAN_CACHE_README="""\
This directory contains important cache information for HarvestMan.
This information is used by HarvestMan to update the project files.
If you delete this directory or its contents, the project update/caching
mechanism wont work.

-The Harvesters (Aug 2003)
"""

#=================================== End Browser page macro strings ===========================================
class SingletonMetaClass(type):
    """Metaclass that makes every class created through it a singleton.

    When a class is created, its ``__new__`` is replaced by a wrapper that
    builds the instance on the first call, stores it in the class attribute
    ``instance`` and returns that same object on every later call.

    Note that only ``__new__`` is intercepted: the class's ``__init__``
    still runs on each call, against the shared instance.
    """

    def __init__(cls, name, bases, dict):
        """Install the singleton ``__new__`` on the freshly created class.

        The arguments are the standard ones passed by the class-creation
        machinery: class name, base-class tuple and class namespace.
        """
        super(SingletonMetaClass, cls).__init__(name, bases, dict)

        # Remember the constructor in effect before it is replaced below;
        # the wrapper needs it to build the one real instance.
        original_new = cls.__new__

        def my_new(cls, *args, **kwds):
            # Create the instance lazily, exactly once per class.
            if cls.instance is None:
                if original_new is object.__new__:
                    # Plain object.__new__ rejects extra constructor
                    # arguments once __new__ is overridden, so do not
                    # forward them; __init__ still receives them.
                    cls.instance = original_new(cls)
                else:
                    cls.instance = original_new(cls, *args, **kwds)
            return cls.instance

        # Each class gets its own slot, so subclasses do not share the
        # instance of their parent.
        cls.instance = None
        cls.__new__ = staticmethod(my_new)

class Registry(object):
    """Singleton access point for HarvestMan's global program objects.

    ``Registry()`` does not produce a ``Registry`` instance: ``__new__``
    creates the private ``__registrySingleton`` object on first use, caches
    it in the class attribute ``instance`` and returns that same object on
    every call.
    """

    class __registrySingleton(object):
        """The actual registry: a fixed set of slots plus lookup tables."""

        # Only these attributes may ever be set on the registry. The
        # program-object slots ('config', 'connector', ...) are left unset
        # by __init__, so reading one before it is assigned raises
        # AttributeError.
        __slots__ = ('ini', 'ofs', 'writeflag','USER_AGENT','userdebug','modfilename',
                     'oldnewmappings','mappings','config','connector','datamanager',
                     'ruleschecker', 'connectorfactory', 'cookiestore', 'trackerqueue',
                     'crawler')

        def __init__(self):
            self.ini = 0
            self.ofs = 0
            self.writeflag = 1
            self.USER_AGENT = 'HarvestMan 1.4'
            self.userdebug = []
            self.modfilename = ''
            self.oldnewmappings = {}
            # Class name -> name of the slot that stores its instance.
            self.mappings = { 'HarvestManStateObject' : 'config',
                              'HarvestManNetworkConnector' : 'connector',
                              'HarvestManUrlConnectorFactory' : 'connectorfactory',
                              'harvestManDataManager' : 'datamanager',
                              'harvestManRulesChecker' : 'ruleschecker',
                              'HarvestManCrawlerQueue' : 'trackerqueue',
                              'harvestMan' : 'crawler',
                              'CookieManager' : 'cookiestore'
                              }

        def __str__(self):
            # repr() instead of the Python-2-only backtick syntax.
            return repr(self)

        def get_object_key(self, obj):
            """ Return the object key for HarvestMan objects.

            The key is looked up by the name of <obj>'s class; an empty
            string is returned when the class is not registered. """

            clsname = obj.__class__.__name__
            return self.mappings.get(clsname, '')

        def get_class_key(self, classname):
            """ Return the object key for HarvestMan classes.

            <classname> is the class name as a string; None is returned
            when the class is not registered. """

            # Fixed: this used to read the non-existent attribute
            # 'mapping', which always raised AttributeError.
            return self.mappings.get(classname)

    # The one shared registry object, created on first Registry() call.
    instance = None

    def __new__(cls): # __new__ always a classmethod
        if Registry.instance is None:
            Registry.instance = Registry.__registrySingleton()

        return Registry.instance

    # The two hooks below only come into play for an object that really is
    # a Registry instance; Registry() itself never returns one.

    def __getattr__(self, name):
        """ Forward attribute reads to the shared registry object; a
        missing attribute yields None. """

        try:
            return getattr(self.instance, name)
        except (AttributeError, KeyError):
            # getattr() signals a missing attribute with AttributeError;
            # KeyError is still accepted as the original handler did.
            return None

    def __setattr__(self, name, value):
        """ Forward attribute writes to the shared registry object. """

        # Fixed: the original signature had no 'value' parameter, so any
        # assignment through this hook failed with TypeError.
        return setattr(self.instance, name, value)

# Single instance of the global lookup object.
# Registry.__new__ returns the cached registry object, so this name and
# any later Registry() call refer to the same object.
RegisterObj = Registry()

def GetRegistryObject():
    """ Hand back the module-wide registry, i.e. the object bound to
    RegisterObj """

    registry = RegisterObj
    return registry

def GetObject(objkey):
    """ Get the registered instance of the HarvestMan program
    object using its key <objkey> by looking up the global
    registry object.

    Initialize() is called first if the registry's 'ini' flag is
    still 0. Returns the stored object, or None (after printing the
    error) when nothing is available under <objkey>. """

    global RegisterObj
    if RegisterObj.ini==0:
        Initialize()

    try:
        # Direct attribute lookup instead of eval() on a concatenated
        # string: same result for a plain key, without executing
        # arbitrary text. (A dotted key such as 'a.b' is no longer
        # traversed; it now yields None.)
        #
        # NOTE(review): the original guarded a weakref.proxy() return
        # with "type(obj) is 'instance'". A type is never identical to
        # a string, so that branch could not run and callers have
        # always received the object itself. That behaviour is kept;
        # confirm before reintroducing proxies.
        return getattr(RegisterObj, str(objkey))
    except (KeyError, AttributeError) as e:
        # "as" and print(...) work on Python 2.6+ and on Python 3.
        print(e)
        return None

def SetObject(obj):
    """ Set the instance <obj> of the HarvestMan program object in
    the global registry object.

    The slot is chosen by the registry from the name of <obj>'s
    class. An object whose class is not known to the registry gets
    an empty key and is silently ignored. """

    global RegisterObj
    # Get the object key
    objkey = RegisterObj.get_object_key(obj)

    if objkey:
        # setattr() replaces exec() of a hand-built assignment
        # statement; the effect on the registry is the same.
        setattr(RegisterObj, str(objkey), obj)

def SetConfig(configobject):
    """ Store <configobject> in the registry's 'config' slot.

    Legacy entry point: no longer used, the generic SetObject()
    is the replacement. """

    global RegisterObj

    # Run Initialize() first while the registry's 'ini' flag is still 0.
    if RegisterObj.ini == 0:
        Initialize()

    setattr(RegisterObj, 'config', configobject)

def SetUserAgent(user_agent):
    """ Set the user agent """

    # This is no longer used.
    # Instead use the more generic method

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -