📄 common.py
字号:
""" Global functions for HarvestMan(R) Program.
This file is part of the HarvestMan(R) software.
For licensing information, see file LICENSE.TXT.
Author: Anand B Pillai (anandpillai at letterboxes dot org).
Created: Jun 10 2003
Jun 4-9 2004 Anand 1.4 development version.
*GetObject() returns a weakref
proxy instead of another reference
to the stored objects.
*Rewrote global registry class.
Renamed it to 'Registry' and made it
a normal singleton. Added __slots__.
Added methods get_object_key and
get_class_key.
*SetObject() finds out object key from
registry object, so an extra argument
is omitted.
Jun 14 2004 Anand 1.3.9 release.
"""
import weakref
import os, sys
import binascii
# Public names exported by a star-import of this module: the logging and
# utility helpers, the registry accessors, and the browser-page macro
# strings defined below.
__all__ = [ "varprint", "info", "moreinfo", "extrainfo", "debug", "moredebug",
"wasOrWere", "plural", "filetype", "rename", "obfuscate", "unobfuscate",
"bin_crypt", "bin_decrypt", "htmlLikeFile", "Initialize", "Finish", "SetUserAgent",
"GetObject", "GetRegistryObject",
"SetObject", "SetUserDebug", "HARVESTMAN_SIG", "HARVESTMAN_PROJECTINFO",
"HARVESTMAN_BOAST", "HARVESTMAN_KEYWORDS", "HARVESTMAN_CREDITS", "HARVESTMAN_BROWSER_CSS",
"HARVESTMAN_BROWSER_TABLE1", "HARVESTMAN_BROWSER_HEADER", "HARVESTMAN_BROWSER_TABLE2",
"HARVESTMAN_BROWSER_TABLE3", "HARVESTMAN_CACHE_README"]
#============================== Start Browser page macro strings ================================================ #
# Short signature string for the program (its use is outside this section).
HARVESTMAN_SIG="Daddy Long Legs"
# HTML table row describing one project. This is a %-format template that
# expects a mapping with the keys PROJECTNAME, PROJECTSTARTPAGE and PROJECTURL.
HARVESTMAN_PROJECTINFO="""\
<TR align=center>
<TD>
%(PROJECTNAME)s
</TD>
<TD>·
<!-- PROJECTPAGE --><A HREF=\"%(PROJECTSTARTPAGE)s\"><!-- END -->
<!-- PROJECTURL -->%(PROJECTURL)s<!-- END -->
</A>
</TD>
</TR>"""
# Plain-text description of the program (no format placeholders).
HARVESTMAN_BOAST="""HarvestMan is an easy-to-use website copying utility. It allows you to download a website in the World Wide Web from the Internet to a local directory. It retrieves html, images, and other files from the remote server to your computer. It builds the local directory structures recursively, and rebuilds links relatively so that you can browse the local site without again connecting to the internet. The robot allows you to customize it in a variety of ways, filtering files based on file extensions/websites/keywords. The robot is customizable by using a configuration file. The program is completely written in Python."""
# Comma-separated keyword list (presumably for an HTML keywords meta tag --
# confirm against the code that emits it).
HARVESTMAN_KEYWORDS="""HarvestMan, HARVESTMAN, HARVESTMan, offline browser, robot, web-spider, website mirror utility, aspirateur web, surf offline, web capture, www mirror utility, browse offline, local site builder, website mirroring, aspirateur www, internet grabber, capture de site web, internet tool, hors connexion, windows, windows 95, windows 98, windows nt, windows 2000, python apps, python tools, python spider"""
# Copyright line; it is the kind of text the %(CREDITS)s placeholder of
# HARVESTMAN_BROWSER_TABLE3 expects (the actual wiring is outside this section).
HARVESTMAN_CREDITS="""\
© 2004-2005, Anand B Pillai. """
# Style sheet text for the browser page. It styles the ids and classes used
# by the HTML table templates in this file (#subTitle, #pageContent,
# .tableWidth, .blak). The text contains no '%' characters, so it is plain
# text rather than a %-format template.
HARVESTMAN_BROWSER_CSS="""\
body {
margin: 0;
padding: 1;
margin-bottom: 15px;
margin-top: 15px;
background: #678;
}
body, td {
font: 14px Arial, Times, sans-serif;
}
#subTitle {
background: #345; color: #fff; padding: 4px; font-weight: bold;
}
#siteNavigation a, #siteNavigation .current {
font-weight: bold; color: #448;
}
#siteNavigation a:link { text-decoration: none; }
#siteNavigation a:visited { text-decoration: none; }
#siteNavigation .current { background-color: #ccd; }
#siteNavigation a:hover { text-decoration: none; background-color: #fff; color: #000; }
#siteNavigation a:active { text-decoration: none; background-color: #ccc; }
a:link { text-decoration: underline; color: #00f; }
a:visited { text-decoration: underline; color: #000; }
a:hover { text-decoration: underline; color: #c00; }
a:active { text-decoration: underline; }
#pageContent {
clear: both;
border-bottom: 6px solid #000;
padding: 10px; padding-top: 20px;
line-height: 1.65em;
background-image: url(backblue.gif);
background-repeat: no-repeat;
background-position: top right;
}
#pageContent, #siteNavigation {
background-color: #ccd;
}
.imgLeft { float: left; margin-right: 10px; margin-bottom: 10px; }
.imgRight { float: right; margin-left: 10px; margin-bottom: 10px; }
hr { height: 1px; color: #000; background-color: #000; margin-bottom: 15px; }
h1 { margin: 0; font: 14px \"Monotype Corsiva\", Times, Arial;
font-weight: bold; font-size: 2em; }
h2 { margin: 0; font-weight: bold; font-size: 1.6em; }
h3 { margin: 0; font-weight: bold; font-size: 1.3em; }
h4 { margin: 0; font-weight: bold; font-size: 1.18em; }
.blak { background-color: #000; }
.hide { display: none; }
.tableWidth { min-width: 400px; }
.tblRegular { border-collapse: collapse; }
.tblRegular td { padding: 6px; background-image: url(fade.gif); border: 2px solid #99c; }
.tblHeaderColor, .tblHeaderColor td { background: #99c; }
.tblNoBorder td { border: 0; }"""
# Title banner table for the browser page.
# NOTE: unlike HARVESTMAN_BROWSER_TABLE2 and HARVESTMAN_BROWSER_TABLE3, this
# string contains a bare '%' (width="76%") rather than a %(PER)s placeholder,
# so it has to be used verbatim; applying the % operator to it would fail.
HARVESTMAN_BROWSER_TABLE1="""\
<table width=\"76%\" border=\"0\" align=\"center\" cellspacing=\"0\" cellpadding=\"3\" class=\"tableWidth\">
<tr>
<td id=\"subTitle\">HARVESTMan Internet Spider - Website Copier</td>
</tr>
</table>"""
# Heading text; it is the kind of value the %(HEADER)s placeholder of
# HARVESTMAN_BROWSER_TABLE2 expects (the actual wiring is outside this section).
HARVESTMAN_BROWSER_HEADER="Index of Downloaded Sites:"
# Main content table of the browser page. This is a %-format template that
# expects a mapping with the keys:
#   PER              - text substituted after each width number (a literal
#                      percent sign cannot appear bare in a %-format string)
#   VERSION          - program version shown in the generator meta tag
#   HEADER           - page heading text
#   PROJECTNAME      - name of the project
#   PROJECTSTARTPAGE - href of the project's start page
#   PROJECTURL       - link text for the project
# The heading markup previously opened <u> but closed </i>; the closing tag
# now matches so the generated HTML is well formed.
HARVESTMAN_BROWSER_TABLE2= """\
<table width=\"76%(PER)s\" border=\"0\" align=\"center\" cellspacing=\"0\" cellpadding=\"0\" class=\"tableWidth\">
<tr class=\"blak\">
<td>
<table width=\"100%(PER)s\" border=\"0\" align=\"center\" cellspacing=\"1\" cellpadding=\"0\">
<tr>
<td colspan=\"6\">
<table width=\"100%(PER)s\" border=\"0\" align=\"center\" cellspacing=\"0\" cellpadding=\"10\">
<tr>
<td id=\"pageContent\">
<!-- ==================== End prologue ==================== -->
<meta name=\"generator\" content=\"HARVESTMAN Internet Spider Version %(VERSION)s \">
<TITLE>Local index - HarvestMan</TITLE>
</HEAD>
<h1 ALIGN=left><u>%(HEADER)s</u></h1>
<TABLE BORDER=\"0\" WIDTH=\"100%(PER)s\" CELLSPACING=\"1\" CELLPADDING=\"0\">
<BR>
<TR align=center>
<TD>
%(PROJECTNAME)s
</TD>
<TD>·
<!-- PROJECTPAGE --><A HREF=\"%(PROJECTSTARTPAGE)s\"><!-- END -->
<!-- PROJECTURL -->%(PROJECTURL)s<!-- END -->
</A>
</TD>
</TR>
</TABLE>
<BR>
<BR>
<BR>
<H6 ALIGN=\"RIGHT\">
<I>Mirror and index made by HARVESTMan Internet Spider [ABP & NK 2003]</I>
</H6>
<!-- ==================== Start epilogue ==================== -->
</td>
</tr>
</table>
</td>
</tr>
</table>
</td>
</tr>
</table>"""
# Footer table of the browser page. This is a %-format template that expects
# a mapping with the keys PER (text substituted after the width number) and
# CREDITS (the footer text).
HARVESTMAN_BROWSER_TABLE3="""\
<table width=\"76%(PER)s\" border=\"0\" align=\"center\" valign=\"bottom\" cellspacing=\"0\" cellpadding=\"0\">
<tr>
<td id=\"footer\"><small>%(CREDITS)s </small></td>
</tr>
</table>"""
# Notice text about the project cache directory (no format placeholders).
# Presumably written to a README file inside that directory -- confirm
# against the code that uses it.
HARVESTMAN_CACHE_README="""\
This directory contains important cache information for HarvestMan.
This information is used by HarvestMan to update the project files.
If you delete this directory or its contents, the project update/caching
mechanism wont work.
-The Harvesters (Aug 2003)
"""
#=================================== End Browser page macro strings ===========================================
class SingletonMetaClass(type):
    """Metaclass that turns every class created with it into a singleton.

    When a class is created, its __new__ is wrapped so that the first
    construction builds and caches an object in the class attribute
    'instance', and every later construction returns that cached object.
    __init__ of the class is still run by the normal call machinery on
    each construction.
    """

    def __init__(cls, name, bases, dict):
        """Install the caching __new__ on the newly created class <cls>.

        name, bases and dict are the standard class-creation arguments
        and are passed through unchanged to type.__init__.
        """
        super(SingletonMetaClass, cls).__init__(name, bases, dict)
        # Remember the constructor the class had before it is replaced
        original_new = cls.__new__

        def my_new(cls, *args, **kwds):
            # Identity test on purpose: '== None' would dispatch to an
            # __eq__ defined by the cached object, and an __eq__ that
            # answers True for None would cause a fresh object to be
            # built on every call.
            if cls.instance is None:
                cls.instance = original_new(cls, *args, **kwds)
            return cls.instance

        # Each class gets its own cache slot, so a subclass does not
        # share the object cached by its base class.
        cls.instance = None
        cls.__new__ = staticmethod(my_new)
class Registry(object):
    """Global registry of HarvestMan program objects and settings.

    Calling Registry() does not produce a Registry instance: __new__
    returns the single shared object of the private __registrySingleton
    class, creating it on first use. Because that object is not a
    Registry, Registry.__init__, __getattr__ and __setattr__ are not
    involved when it is used; they only matter for a genuine Registry
    instance, which forwards attribute access to the shared object.
    """

    class __registrySingleton(object):
        """Holder for the global settings and the registered objects."""

        # Fixed attribute set; assigning any other attribute name on the
        # holder raises AttributeError.
        __slots__ = ('ini', 'ofs', 'writeflag', 'USER_AGENT', 'userdebug',
                     'modfilename', 'oldnewmappings', 'mappings', 'config',
                     'connector', 'datamanager', 'ruleschecker',
                     'connectorfactory', 'cookiestore', 'trackerqueue',
                     'crawler')

        def __init__(self):
            """Set the default settings; object slots stay unset."""
            self.ini = 0
            self.ofs = 0
            self.writeflag = 1
            self.USER_AGENT = 'HarvestMan 1.4'
            self.userdebug = []
            self.modfilename = ''
            self.oldnewmappings = {}
            # Class name of a program object -> name of the slot that
            # stores the registered instance of that class.
            self.mappings = {'HarvestManStateObject': 'config',
                             'HarvestManNetworkConnector': 'connector',
                             'HarvestManUrlConnectorFactory': 'connectorfactory',
                             'harvestManDataManager': 'datamanager',
                             'harvestManRulesChecker': 'ruleschecker',
                             'HarvestManCrawlerQueue': 'trackerqueue',
                             'harvestMan': 'crawler',
                             'CookieManager': 'cookiestore'}

        def __str__(self):
            """Return the same text as repr()."""
            # repr() replaces the backtick form, which is a syntax error
            # on Python 3 and means the same thing on Python 2.
            return repr(self)

        def get_object_key(self, obj):
            """ Return the object key for HarvestMan objects.

            The key is looked up by the class name of <obj>; an empty
            string is returned when the class is not in the mappings. """
            clsname = obj.__class__.__name__
            return self.mappings.get(clsname, '')

        def get_class_key(self, classname):
            """ Return the object key for HarvestMan classes.

            Returns None when <classname> is not in the mappings. """
            # Fixed: this used to read the nonexistent attribute
            # 'mapping' and therefore always raised AttributeError.
            return self.mappings.get(classname)

    # The shared holder object, created by the first Registry() call
    instance = None

    def __new__(cls):  # __new__ always a classmethod
        """Return the shared holder object, creating it on first use."""
        if Registry.instance is None:
            Registry.instance = Registry.__registrySingleton()
        return Registry.instance

    def __getattr__(self, name):
        """Forward attribute reads to the shared holder object.

        Returns None when the holder has no such attribute or the slot
        has not been assigned yet.
        """
        try:
            return getattr(self.instance, name)
        except AttributeError:
            # getattr() signals a missing attribute with AttributeError;
            # the KeyError caught here before could never be raised.
            return None

    def __setattr__(self, name, value):
        """Forward attribute writes to the shared holder object."""
        # Fixed: the 'value' parameter was missing, so any assignment
        # through this method raised TypeError.
        return setattr(self.instance, name, value)
# Single instance of the global lookup object.
# Registry() returns the shared holder object kept in Registry.instance,
# so this name is bound to that holder, not to a Registry instance.
RegisterObj = Registry()
def GetRegistryObject():
    """ Hand back the module-level registry object (RegisterObj) """
    registry = RegisterObj
    return registry
def GetObject(objkey):
    """ Get the registered instance of the HarvestMan program
    object using its key <objkey> by looking up the global
    registry object.

    The registry is initialized first (via Initialize()) when its
    'ini' flag is still 0. <objkey> is converted to a string; a dotted
    key such as 'a.b' is resolved one attribute at a time.

    Returns the stored object, or None (after printing the error) when
    the lookup fails, e.g. because nothing has been stored under that
    key yet. """

    if RegisterObj.ini == 0:
        Initialize()

    try:
        # Plain attribute lookup instead of eval() on a built expression:
        # same result for attribute keys, but no arbitrary code is run.
        obj = RegisterObj
        for part in str(objkey).split('.'):
            obj = getattr(obj, part)
    except (KeyError, AttributeError):
        # sys.exc_info() avoids the 'except X, e' / 'except X as e'
        # spellings, so this parses on both old and new interpreters.
        print(sys.exc_info()[1])
        return None

    # NOTE(review): the previous code tested "type(obj) is 'instance'",
    # which compares a type with a string and is never true, so a
    # weakref proxy was never actually returned. The object itself is
    # returned, exactly as before; confirm whether proxies are wanted.
    return obj
def SetObject(obj):
    """ Store the HarvestMan program object <obj> in the global
    registry object.

    The registry attribute to use is found from the class name of
    <obj> (RegisterObj.get_object_key). Objects of classes the
    registry does not know yield an empty key and are ignored. """

    # Get the object key
    objkey = RegisterObj.get_object_key(obj)

    if objkey:
        # setattr() replaces exec() on a built assignment string: same
        # effect, without compiling and running generated code.
        setattr(RegisterObj, str(objkey), obj)
def SetConfig(configobject):
    """ Store <configobject> as the 'config' entry of the registry """

    # Legacy entry point: no longer used, the generic
    # SetObject(obj) is the preferred way to register objects.
    global RegisterObj

    # Make sure the registry has been initialized before it is written to
    needs_init = (RegisterObj.ini == 0)
    if needs_init:
        Initialize()

    setattr(RegisterObj, 'config', configobject)
def SetUserAgent(user_agent):
""" Set the user agent """
# This is no longer used.
# Instead use the more generic method
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -