⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 utils.py

📁 网络蜘蛛
💻 PY
📖 第 1 页 / 共 2 页
字号:
    def __read_xml_project_file(self):
        """ Parse the XML project file named by the configuration.

        The path is taken from the 'projectfile' attribute of the
        'config' object and handed to harvestManXMLParser.ParseProjectFile.

        Returns 0 unless ParseProjectFile returns -1, in which case
        -1 is returned. """

        # The parser module is imported at call time, not at module level
        from HarvestManXMLParser import harvestManXMLParser

        xmlparser = harvestManXMLParser()
        status = xmlparser.ParseProjectFile(GetObject('config').projectfile)

        if status == -1:
            return -1

        return 0

    def __read_pickled_project_file(self):

        config = GetObject('config')

        filename = config.projectfile

        try:
            pickler = HarvestManPickler()
            config.set_dictionary(pickler.load(filename))
            return 0
        except HarvestManPicklerError, e:
            print e
            return -1

    def __write_pickled_project_file(self):

        cfg = GetObject('config')

        pckfile = os.path.join(cfg.basedir, cfg.project + '.hbp')

        if os.path.exists(pckfile):
            try:
                os.remove(pckfile)
            except OSError, e:
                print e
                return -1

        try:
            # We dont want to write the proxy password string to the
            # project file, so save it and set dictionary value to "".
            ppasswd = cfg.ppasswd
            cfg.ppasswd=""
            pickler = HarvestManPickler()
            pickler.dump( cfg.__dict__, pckfile, False)
            cfg.ppasswd=ppasswd
            extrainfo('Done.')

        except HarvestManPicklerError, e:
            print e
            return -1

        moreinfo('Done.')
        return 0

    def __write_xml_project_file(self):

        cfg = GetObject('config')

        # The project file is written directly to the basedir
        projfilename = os.path.join(cfg.basedir, cfg.project + '.hmp')

        # If file already exists, shred it
        if os.path.exists(projfilename):
            try:
                os.remove(projfilename)
            except OSError, e:
                print e
                return -1

        # Copy the HarvestMan DTD from the installation
        dtdfile = os.path.join(cfg.basedir, "HarvestManProject.dtd")
        if not os.path.exists(dtdfile):
            try:
                copy("./HarvestManProject.dtd", dtdfile)
            except:  # Catch all exceptions
                return -1

        extrainfo('Writing HarvestMan XML project file', projfilename, '...')
        # Write the xml project file

        try:
            fs=file(projfilename, 'w')
            fs.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
            fs.write("<!DOCTYPE HarvestManProject SYSTEM \"HarvestManProject.dtd\">\n")
            fs.write("<PROJECT Name=\"" + cfg.project + "\">\n")
            fs.write("\t<URL Location=\"" + cfg.url + "\" />\n")
            fs.write("\t<BASEDIR Location=\"" + cfg.basedir + "\" />\n")

            for option in cfg.Options().keys():
                # We are writing the project file using old style config variables
                if option.find('.') != -1: continue
                # Skip URL, BASEDIR, PROJECT
                # Also skip any proxy password because of security reasons
                if option in ('URL', 'PROJECT', 'BASEDIR', 'PROXYPASSWD'): continue
                # For file type variables attribute name is Name, otherwise Value
                attrib=""
                if option in ('LOGFILE', 'URLSLISTFILE', 'ERRORFILE'):
                    attrib="Name"
                else:
                    attrib="Value"
                # Write the element and its attribute and its value
                if option == 'URLFILTER': value = cfg.urlfilter
                elif option == 'SERVERFILTER': value = cfg.serverfilter
                else: value = cfg.getVariable(option)
                fs.write("\t<" + option + " " + attrib + "=\"" + str(value) + "\" />\n")

                fs.write("</PROJECT>\n")
                fs.close()
                extrainfo('Done.')
        except Exception, e:
            extrainfo(e)
            return -1

        return 0


class HarvestManBrowser:
    """ Utility class to write the project browse pages.

    The browse page is <basedir>/index.html. It lists downloaded
    projects and carries a HARVESTMan signature comment that is used
    to recognise a page written by this class. """

    def __init__(self):
        """ Record the project's local start page (as a file:// URL
        with forward slashes) and the 'config' object. """

        tq = GetObject('trackerqueue')
        self._projectstartpage = os.path.abspath(tq.get_base_urlobject().get_full_filename())
        self._projectstartpage = 'file://' + self._projectstartpage.replace('\\', '/')
        self._cfg = GetObject('config')

    def make_project_browse_page(self):
        """ This creates an xhtml page for browsing the downloaded html pages.

        Does nothing when config.browsepage is 0. Otherwise the project
        is added to the existing browse page, or a fresh page is made
        if that is not possible, and the page is opened in the user's
        web browser. """

        if self._cfg.browsepage == 0:
            return

        if self.__add_project_to_browse_page() == -1:
            self.__make_initial_browse_page()

        # Open the browser page in the user's webbrowser
        info('Opening project in browser...')
        import webbrowser

        browsefile = os.path.join(self._cfg.basedir, 'index.html')
        try:
            webbrowser.open(browsefile)
            debug('Done.')
        except webbrowser.Error, e:
            print e
        return

    def __write_browse_file(self, browsefile, contents):
        """ Write 'contents' to 'browsefile', replacing the file.

        Returns 0 on success. On an IOError or OSError the error is
        printed and -1 is returned. """

        try:
            f = open(browsefile, 'w')
            # Nested try/finally closes the file even if write() fails
            try:
                f.write(contents)
            finally:
                f.close()
        except (IOError, OSError), e:
            # open() and write() raise IOError in Python 2, so catching
            # OSError alone would let the failure escape.
            print e
            return -1

        return 0

    def __add_project_to_browse_page(self):
        """ Append new project information to existing project browser page.

        If the page's most recent project entry has the same URL as the
        current project, that entry's start page is updated instead of
        adding a new entry.

        Returns 0 when the page was rewritten. Returns -1 when the page
        does not exist, cannot be read or written, or is not recognised
        as a HARVESTMan browse page. """

        browsefile = os.path.join(self._cfg.basedir, 'index.html')
        if not os.path.exists(browsefile): return -1

        # read contents of file
        contents = ''
        try:
            f = open(browsefile, 'r')
            try:
                contents = f.read()
            finally:
                f.close()
        except (IOError, OSError), e:
            print e
            return -1

        if not contents: return -1

        # See if this is a proper browse file created by HARVESTMan
        index = contents.find("HARVESTMan SIG:")
        if index == -1: return -1
        # Skip 'HARVESTMan SIG: <' (17 characters) and take the next 15.
        # NOTE(review): assumes HARVESTMAN_SIG is 15 characters long -- confirm.
        sig = contents[(index+17):(index+32)].strip()
        if sig != HARVESTMAN_SIG: return -1
        # Locate position to insert project info
        index = contents.find(HARVESTMAN_BROWSER_HEADER)
        if index == -1: return -1
        # get project page (last PROJECTPAGE marker after the header)
        index = contents.rfind('<!-- PROJECTPAGE -->', index)
        if index == -1: return -1
        newindex = contents.find('<!-- END -->', index)
        # Without the END marker the slice below would be meaningless
        if newindex == -1: return -1
        # NOTE(review): the offsets +29 / -2 presumably skip markup that
        # HARVESTMAN_PROJECTINFO puts around the page link -- confirm.
        projpage = contents[(index+29):(newindex-2)]
        # get project url
        index = contents.find('<!-- PROJECTURL -->', newindex)
        if index == -1: return -1

        newindex = contents.find('<!-- END -->', index)
        if newindex == -1: return -1
        prjurl = contents[(index+19):newindex]

        if prjurl and prjurl == self._cfg.url:
            debug('Duplicate project!')
            # Same URL as the existing entry: only the start page can
            # differ, so that is the only text replaced. (Replacing the
            # URL with itself would be a no-op.)
            newcontents = contents
            if projpage:
                newcontents = newcontents.replace(projpage, self._projectstartpage)
        else:
            # find location of </TR> from this index
            index = contents.find('</TR>', newindex)
            if index == -1: return -1
            newprojectinfo = HARVESTMAN_PROJECTINFO % {'PROJECTNAME': self._cfg.project,
                                                       'PROJECTSTARTPAGE': self._projectstartpage,
                                                       'PROJECTURL' : self._cfg.url }
            # insert this string in place of the 5-character '</TR>'
            newcontents = contents[:index] + '\n' + newprojectinfo + contents[index+5:]

        return self.__write_browse_file(browsefile, newcontents)

    def __make_initial_browse_page(self):
        """ This creates an xhtml page for browsing the downloaded
        files similar to HTTrack copier.

        The page is written to <basedir>/index.html, replacing any
        existing file, and ends with the HARVESTMan signature comment.
        Errors opening or writing the file are not caught here. """

        debug('Making fresh page...')

        cfg = GetObject('config')
        browsefile = os.path.join(self._cfg.basedir, 'index.html')

        f = open(browsefile, 'w')
        # Close the file explicitly so the page is completely on disk
        # before the caller opens it in the web browser.
        try:
            f.write("<html xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"en\">\n\n")
            f.write("<head>\n")
            f.write("\t<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\" />\n")
            f.write("\t<meta name=\"description\" content=\"" + HARVESTMAN_BOAST + "\" />\n")
            f.write("\t<meta name=\"keywords\" content=\"" + HARVESTMAN_KEYWORDS + "\" />\n")
            f.write("\t<title>Local index - HARVESTMAN Internet Spider</title>\n")
            f.write("<!-- Mirror and index made by HARVESTMAN Internet Spider/" + cfg.version + " [ABP, NK '2003] -->\n")
            f.write("<style type=\"text/css\">\n")
            f.write("<!--\n\n")
            f.write(HARVESTMAN_BROWSER_CSS)
            f.write("\n\n")
            f.write("// -->\n")
            f.write("</style>\n")
            f.write("</head>\n")
            f.write(HARVESTMAN_BROWSER_TABLE1)
            # 'PER' supplies a literal percent sign to the template
            table = HARVESTMAN_BROWSER_TABLE2 % {'PER'    : '%',
                                                 'VERSION': cfg.version,
                                                 'HEADER' : HARVESTMAN_BROWSER_HEADER,
                                                 'PROJECTNAME': self._cfg.project,
                                                 'PROJECTSTARTPAGE': self._projectstartpage,
                                                 'PROJECTURL' : self._cfg.url}
            f.write(table)
            f.write("<BR><BR><BR><BR>\n")
            f.write("<HR width=76%>\n")
            table = HARVESTMAN_BROWSER_TABLE3 % {'PER'    : '%',
                                                 'CREDITS': HARVESTMAN_CREDITS }
            f.write(table)
            f.write("</body>\n")

            # insert signature
            sigstr = "<!-- HARVESTMan SIG: <" + HARVESTMAN_SIG + "> -->\n"
            f.write(sigstr)
            f.write("</html>\n")
        finally:
            f.close()







⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -