# File: utils.py
# (Code-viewer UI label captured with the source, not part of the module: "Font size:")
def __read_xml_project_file(self):
    """Read the project settings from the XML project file.

    The file parsed is the one named by the ``projectfile`` attribute
    of the 'config' object.

    Returns -1 when the parser reports -1, otherwise 0.
    """
    from HarvestManXMLParser import harvestManXMLParser

    xmlparser = harvestManXMLParser()
    projfile = GetObject('config').projectfile
    if xmlparser.ParseProjectFile(projfile) == -1:
        return -1
    return 0
def __read_pickled_project_file(self):
    """Read the project settings from the pickled project file.

    Loads the file named by the ``projectfile`` attribute of the
    'config' object and hands the loaded data to
    ``config.set_dictionary``.

    Returns 0 on success. If a HarvestManPicklerError is raised, it is
    printed and -1 is returned.
    """
    cfg = GetObject('config')
    path = cfg.projectfile
    try:
        loader = HarvestManPickler()
        # NOTE(review): presumably this unpickles the file; if so, only
        # project files from a trusted source should be loaded -- confirm.
        cfg.set_dictionary(loader.load(path))
    except HarvestManPicklerError as err:
        print(err)
        return -1
    return 0
def __write_pickled_project_file(self):
    """Write the configuration to a pickled project file.

    The file is ``<basedir>/<project>.hbp``. An existing file of that
    name is removed first. The proxy password is blanked in the
    configuration for the duration of the dump so that it is never
    written to disk, and is always restored afterwards -- including
    when the dump fails.

    Returns 0 on success. Returns -1 if the old file cannot be removed
    (OSError) or the dump raises HarvestManPicklerError; the error is
    printed in both cases.
    """
    cfg = GetObject('config')
    pckfile = os.path.join(cfg.basedir, cfg.project + '.hbp')

    if os.path.exists(pckfile):
        try:
            os.remove(pckfile)
        except OSError as e:
            print(e)
            return -1

    # We dont want to write the proxy password string to the
    # project file, so save it and set dictionary value to "".
    ppasswd = cfg.ppasswd
    cfg.ppasswd = ""
    try:
        pickler = HarvestManPickler()
        pickler.dump(cfg.__dict__, pckfile, False)
    except HarvestManPicklerError as e:
        print(e)
        return -1
    finally:
        # Restore the password even when the dump fails; otherwise the
        # running configuration would be left with an empty password.
        cfg.ppasswd = ppasswd

    extrainfo('Done.')
    moreinfo('Done.')
    return 0
def __write_xml_project_file(self):
    """Write the configuration to an XML project file.

    The file is ``<basedir>/<project>.hmp``. An existing file of that
    name is removed first, and ``HarvestManProject.dtd`` is copied from
    the current directory into the base directory if it is not already
    there.

    Attribute values are XML-escaped, so URLs containing ``&`` and
    values containing quotes or angle brackets still produce a
    well-formed document. Values without such characters are written
    exactly as before.

    Returns 0 on success and -1 on any failure (removing the old file,
    copying the DTD, or writing the project file).
    """
    from xml.sax.saxutils import quoteattr

    cfg = GetObject('config')
    # The project file is written directly to the basedir
    projfilename = os.path.join(cfg.basedir, cfg.project + '.hmp')

    # If file already exists, shred it
    if os.path.exists(projfilename):
        try:
            os.remove(projfilename)
        except OSError as e:
            print(e)
            return -1

    # Copy the HarvestMan DTD from the installation
    dtdfile = os.path.join(cfg.basedir, "HarvestManProject.dtd")
    if not os.path.exists(dtdfile):
        try:
            copy("./HarvestManProject.dtd", dtdfile)
        except Exception as e:
            # Still treated as a failure, but no longer silent.
            extrainfo(e)
            return -1

    extrainfo('Writing HarvestMan XML project file', projfilename, '...')

    # Options that are handled separately or must never be written.
    # The proxy password is skipped because of security reasons.
    skipped = ('URL', 'PROJECT', 'BASEDIR', 'PROXYPASSWD')
    # For file type variables attribute name is Name, otherwise Value
    file_options = ('LOGFILE', 'URLSLISTFILE', 'ERRORFILE')

    # Write the xml project file
    try:
        fs = open(projfilename, 'w')
        try:
            fs.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
            fs.write("<!DOCTYPE HarvestManProject SYSTEM \"HarvestManProject.dtd\">\n")
            # quoteattr() returns the value already wrapped in quotes.
            fs.write("<PROJECT Name=" + quoteattr(cfg.project) + ">\n")
            fs.write("\t<URL Location=" + quoteattr(cfg.url) + " />\n")
            fs.write("\t<BASEDIR Location=" + quoteattr(cfg.basedir) + " />\n")

            for option in cfg.Options().keys():
                # We are writing the project file using old style
                # config variables
                if '.' in option:
                    continue
                if option in skipped:
                    continue

                if option in file_options:
                    attrib = "Name"
                else:
                    attrib = "Value"

                if option == 'URLFILTER':
                    value = cfg.urlfilter
                elif option == 'SERVERFILTER':
                    value = cfg.serverfilter
                else:
                    value = cfg.getVariable(option)

                # Write the element and its attribute and its value
                fs.write("\t<" + option + " " + attrib + "="
                         + quoteattr(str(value)) + " />\n")

            fs.write("</PROJECT>\n")
        finally:
            # Close the handle even if one of the writes fails.
            fs.close()
        extrainfo('Done.')
    except Exception as e:
        extrainfo(e)
        return -1

    return 0
class HarvestManBrowser:
    """ Utility class to write the project browse pages.

    The browse page is ``index.html`` in the configured base directory.
    A page previously written by this class is extended with the
    current project; otherwise a fresh page is generated.
    """

    def __init__(self):
        tq = GetObject('trackerqueue')
        startfile = os.path.abspath(tq.get_base_urlobject().get_full_filename())
        # Backslashes are turned into forward slashes for the file URL.
        self._projectstartpage = 'file://' + startfile.replace('\\', '/')
        self._cfg = GetObject('config')

    def make_project_browse_page(self):
        """ Create or update the browse page and open it in a web browser.

        Does nothing when the ``browsepage`` config value is 0.
        """
        if self._cfg.browsepage == 0:
            return

        # Fall back to a fresh page when the existing one is missing,
        # unreadable or not recognised as ours.
        if self.__add_project_to_browse_page() == -1:
            self.__make_initial_browse_page()

        # Open the browser page in the user's webbrowser
        info('Opening project in browser...')
        import webbrowser

        browsefile = os.path.join(self._cfg.basedir, 'index.html')
        try:
            webbrowser.open(browsefile)
            debug('Done.')
        except webbrowser.Error as e:
            print(e)
        return

    def __write_browse_file(self, browsefile, text):
        """ Overwrite browsefile with text.

        Returns 0 on success; prints the error and returns -1 when the
        file cannot be opened or written.
        """
        try:
            with open(browsefile, 'w') as f:
                f.write(text)
        except (IOError, OSError) as e:
            # On Python 2 open() raises IOError, which is not an OSError.
            print(e)
            return -1
        return 0

    def __add_project_to_browse_page(self):
        """ Append new project information to existing project browser page.

        Returns -1 when the existing page is missing, unreadable, empty,
        lacks the expected signature or markers, or cannot be rewritten.
        Returns 0 when the page was updated.
        """
        browsefile = os.path.join(self._cfg.basedir, 'index.html')
        if not os.path.exists(browsefile):
            return -1

        # read contents of file
        try:
            with open(browsefile, 'r') as f:
                contents = f.read()
        except (IOError, OSError) as e:
            print(e)
            return -1

        if not contents:
            return -1

        # See if this is a proper browse file created by HARVESTMan.
        # The signature comment is written as "HARVESTMan SIG: <sig>",
        # so the signature text starts 17 characters after the match.
        index = contents.find("HARVESTMan SIG:")
        if index == -1:
            return -1
        sig = contents[(index + 17):(index + 32)].strip()
        if sig != HARVESTMAN_SIG:
            return -1

        # Locate position to insert project info
        index = contents.find(HARVESTMAN_BROWSER_HEADER)
        if index == -1:
            return -1

        # get project page (last PROJECTPAGE marker after the header)
        index = contents.rfind('<!-- PROJECTPAGE -->', index)
        if index == -1:
            return -1
        newindex = contents.find('<!-- END -->', index)
        if newindex == -1:
            # Malformed page: without the END marker the slice below
            # would be meaningless.
            return -1
        # The marker is 20 characters long; the extra 9 skipped here and
        # the 2 dropped at the end presumably belong to the surrounding
        # link markup in HARVESTMAN_PROJECTINFO -- TODO confirm.
        projpage = contents[(index + 29):(newindex - 2)]

        # get project url
        index = contents.find('<!-- PROJECTURL -->', newindex)
        if index == -1:
            return -1
        newindex = contents.find('<!-- END -->', index)
        if newindex == -1:
            return -1
        prjurl = contents[(index + 19):newindex]

        if prjurl and prjurl == self._cfg.url:
            debug('Duplicate project!')
            # Same URL as the recorded project: only the start page can
            # have changed. (Replacing the URL with itself would be a
            # no-op, and previously discarded this replacement.)
            newcontents = contents
            if projpage:
                newcontents = newcontents.replace(projpage,
                                                  self._projectstartpage)
        else:
            # find location of </TR> from this index
            index = contents.find('</TR>', newindex)
            if index == -1:
                return -1

            newprojectinfo = HARVESTMAN_PROJECTINFO % {
                'PROJECTNAME': self._cfg.project,
                'PROJECTSTARTPAGE': self._projectstartpage,
                'PROJECTURL': self._cfg.url}
            # insert this string in place of the 5-character '</TR>'
            newcontents = (contents[:index] + '\n' + newprojectinfo
                           + contents[index + 5:])

        return self.__write_browse_file(browsefile, newcontents)

    def __make_initial_browse_page(self):
        """ This creates an xhtml page for browsing the downloaded
        files similar to HTTrack copier.

        The page is written to ``index.html`` in the base directory and
        ends with the signature comment that
        __add_project_to_browse_page looks for. Errors from opening or
        writing the file are not caught here.
        """
        debug('Making fresh page...')

        cfg = GetObject('config')
        browsefile = os.path.join(self._cfg.basedir, 'index.html')

        table2 = HARVESTMAN_BROWSER_TABLE2 % {
            'PER': '%',
            'VERSION': cfg.version,
            'HEADER': HARVESTMAN_BROWSER_HEADER,
            'PROJECTNAME': self._cfg.project,
            'PROJECTSTARTPAGE': self._projectstartpage,
            'PROJECTURL': self._cfg.url}
        table3 = HARVESTMAN_BROWSER_TABLE3 % {
            'PER': '%',
            'CREDITS': HARVESTMAN_CREDITS}
        sigstr = "<!-- HARVESTMan SIG: <" + HARVESTMAN_SIG + "> -->\n"

        # The with-block closes the file so the page is complete on disk
        # before the caller opens it in the browser.
        with open(browsefile, 'w') as f:
            f.write("<html xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"en\">\n\n")
            f.write("<head>\n")
            f.write("\t<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\" />\n")
            f.write("\t<meta name=\"description\" content=\"" + HARVESTMAN_BOAST + "\" />\n")
            f.write("\t<meta name=\"keywords\" content=\"" + HARVESTMAN_KEYWORDS + "\" />\n")
            f.write("\t<title>Local index - HARVESTMAN Internet Spider</title>\n")
            f.write("<!-- Mirror and index made by HARVESTMAN Internet Spider/" + cfg.version + " [ABP, NK '2003] -->\n")
            f.write("<style type=\"text/css\">\n")
            f.write("<!--\n\n")
            f.write(HARVESTMAN_BROWSER_CSS)
            f.write("\n\n")
            f.write("// -->\n")
            f.write("</style>\n")
            f.write("</head>\n")
            f.write(HARVESTMAN_BROWSER_TABLE1)
            f.write(table2)
            f.write("<BR><BR><BR><BR>\n")
            f.write("<HR width=76%>\n")
            f.write(table3)
            f.write("</body>\n")
            # insert signature
            f.write(sigstr)
            f.write("</html>\n")
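# --- Code-viewer UI text captured with the source; not part of the module ---
# Keyboard shortcuts:
#   Copy code:          Ctrl + C
#   Search code:        Ctrl + F
#   Full-screen mode:   F11
#   Toggle theme:       Ctrl + Shift + D
#   Show shortcuts:     ?
#   Increase font size: Ctrl + =
#   Decrease font size: Ctrl + -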