
📄 datamgr.py

📁 HarvestMan - latest version
💻 PY
📖 Page 1 of 4
        orig_url = urlobj.get_full_url()
        old_urlobj = urlobj.get_original_state()
        domain_changed_a_lot = False

        # If this was a re-directed URL, check if there is a
        # considerable change in the domains. If there is,
        # there is a very good chance that the original URL
        # is redirecting to mirrors, so we can split on
        # the original URL and this would automatically
        # produce a split-mirror download without us having
        # to do any extra work!
        if urlobj.redirected and old_urlobj != None:
            old_domain = old_urlobj.get_domain()
            if old_domain != domain:
                # Check if it is somewhat similar
                # if domain.find(old_domain) == -1:
                domain_changed_a_lot = True

        try:
            self._serversdict[domain]
        except KeyError:
            self._serversdict[domain] = {'accept-ranges': True}

        if self.mirrormgr.mirrors_available(urlobj):
            return self.mirrormgr.download_multipart_url(urlobj, clength, self._cfg.numparts, self._urlThreadPool)
        else:
            if domain_changed_a_lot:
                urlobj = old_urlobj
                # Set a flag to indicate this
                urlobj.redirected_old = True

        parts = self._cfg.numparts
        # Calculate size of each piece
        piecesz = clength/parts

        # Make a list of piece sizes
        pcsizes = [piecesz]*parts
        # For last URL add the remainder
        pcsizes[-1] += clength % parts

        # Create a URL object for each and set range
        urlobjects = []
        for x in range(parts):
            urlobjects.append(copy.copy(urlobj))

        prev = 0
        for x in range(parts):
            curr = pcsizes[x]
            next = curr + prev
            urlobject = urlobjects[x]
            # Set mirror_url attribute
            urlobject.mirror_url = urlobj
            urlobject.trymultipart = True
            urlobject.clength = clength
            urlobject.range = (prev, next-1)
            urlobject.mindex = x
            prev = next
            self._urlThreadPool.push(urlobject)

        # Push these URL objects to the pool
        return URL_PUSHED_TO_POOL

    def download_url(self, caller, url):

        no_threads = (not self._cfg.usethreads) or \
                     url.is_webpage() or \
                     url.is_stylesheet()

        data=""
        if no_threads:
            # This call will block if we exceed the number of connections
            url.qstatus = urlparser.URL_QUEUED

            conn = objects.connfactory.create_connector()
            # Set status to queued
            url.qstatus = urlparser.URL_IN_QUEUE

            res = conn.save_url( url )

            objects.connfactory.remove_connector(conn)

            filename = url.get_full_filename()

            if res != CONNECT_NO_ERROR:
                filename = url.get_full_filename()

                self.update_file_stats( url, res )

                if res==DOWNLOAD_YES_OK:
                    info("Saved",filename)

                if url.is_webpage():
                    if self._cfg.datamode==CONNECTOR_DATA_MODE_INMEM:
                        data = conn.get_data()
                    elif os.path.isfile(filename):
                        # Need to read data from the file...
                        data = open(filename, 'rb').read()
            else:
                fetchurl = url.get_full_url()
                extrainfo( "Failed to download url", fetchurl)

            self._urldb.update(url.index, url)

        else:
            # Set status to queued
            self.thread_download( url )

        return data

    def clean_up(self):
        """ Purge data for a project by cleaning up
        lists, dictionaries and resetting other member items"""

        # Reset byte count
        if self._urldb and self._urldb.size:
            del self._urldb
        if self.collections and self.collections.size:
            del self.collections
        self.reset()

    def archive_project(self):
        """ Archive project files into a tar archive file.
        The archive will be further compressed in gz or bz2
        format """

        extrainfo("Archiving project files...")
        # Get project directory
        projdir = self._cfg.projdir
        # Get archive format
        if self._cfg.archformat=='bzip':
            format='bz2'
        elif self._cfg.archformat=='gzip':
            format='gz'
        else:
            error("Archive Error: Archive format not recognized")
            return INVALID_ARCHIVE_FORMAT

        # Create tarfile name
        ptarf = os.path.join(self._cfg.basedir, "".join((self._cfg.project,'.tar.',format)))
        cwd = os.getcwd()
        os.chdir(self._cfg.basedir)

        # Create tarfile object
        tf = tarfile.open(ptarf,'w:'+format)
        # Projdir base name
        pbname = os.path.basename(projdir)

        # Add directories
        for item in os.listdir(projdir):
            # Skip cache directory, if any
            if item=='hm-cache':
                continue
            # Add directory
            fullpath = os.path.join(projdir,item)
            if os.path.isdir(fullpath):
                tf.add(os.path.join(pbname,item))

        # Dump the tarfile
        tf.close()
        os.chdir(cwd)

        # Check whether writing was done
        if os.path.isfile(ptarf):
            extrainfo("Wrote archive file",ptarf)
            return FILE_WRITE_OK
        else:
            error("Error in writing archive file",ptarf)
            return FILE_WRITE_ERROR

    def add_headers_to_cache(self):
        """ Add original URL headers of urls downloaded
        as an entry to the cache file """

        # Navigate in pre-order, i.e in the order of insertion...
        for node in self.collections.preorder():
            coll = node.get()

            # Get list of links for this collection
            for urlobjidx in coll.getAllURLs():
                urlobj = self.get_url(urlobjidx)
                if urlobj==None: continue

                url = urlobj.get_full_url()
                # Get headers
                headers = urlobj.get_url_content_info()

                if headers:
                    content = self.cache._url[url]
                    if content:
                        urldict = content[0]
                        urldict['headers'] = headers

    def dump_headers(self):
        """ Dump the headers of the web pages
        downloaded, into a DBM file """

        # print dbmfile
        extrainfo("Writing url headers database")

        headersdict = {}
        for node in self.collections.preorder():
            coll = node.get()

            for urlobjidx in coll.getAllURLs():
                urlobj = self.get_url(urlobjidx)

                if urlobj:
                    url = urlobj.get_full_url()
                    # Get headers
                    headers = urlobj.get_url_content_info()
                    if headers:
                        headersdict[url] = str(headers)

        cache = utils.HarvestManCacheReaderWriter(self.get_proj_cache_directory())
        return cache.write_url_headers(headersdict)

    def localise_links(self):
        """ Localise all links (urls) of the downloaded html pages """

        # Dont confuse 'localising' with language localization.
        # This means just converting the outward (Internet) pointing
        # URLs in files to local files.

        info('Localising links of downloaded web pages...',)
        count = 0
        localized = []

        for node in self.collections.preorder():
            coll = node.get()

            sourceurl = self.get_url(coll.getSourceURL())
            childurls = [self.get_url(index) for index in coll.getAllURLs()]
            filename = sourceurl.get_full_filename()

            if (not filename in localized) and os.path.exists(filename):
                extrainfo('Localizing links for',filename)
                if SUCCESS(self.localise_file_links(filename, childurls)):
                    count += 1
                    localized.append(filename)

        info('Localised links of',count,'web pages.')

    def localise_file_links(self, filename, links):
        """ Localise links for this file """

        data=''

        try:
            fw=open(filename, 'r+')
            data=fw.read()
            fw.seek(0)
            fw.truncate(0)
        except (OSError, IOError),e:
            return FILE_TRUNCATE_ERROR

        # Regex1 to replace ) at the end
        r1 = re.compile(r'\)+$')
        r2 = re.compile(r'\(+$')

        # MOD: Replace any <base href="..."> line
        basehrefre = re.compile(r'<base href=.*>', re.IGNORECASE)
        if basehrefre.search(data):
            data = re.sub(basehrefre, '', data)

        for u in links:
            if not u: continue

            url_object = u
            typ = url_object.get_type()

            if url_object.is_image():
                http_str="src"
            else:
                http_str="href"

            v = url_object.get_original_url()
            if v == '/': continue

            # Somehow, some urls seem to have an
            # unbalanced parentheses at the end.
            # Remove it. Otherwise it will crash
            # the regular expressions below.
            v = r1.sub('', v)
            v2 = r2.sub('', v)

            # Bug fix, dont localize cgi links
            if typ != 'base':
                if url_object.is_cgi():
                    continue

                fullfilename = os.path.abspath( url_object.get_full_filename() )
                urlfilename=''

                # Modification: localisation w.r.t relative pathnames
                if self._cfg.localise==2:
                    urlfilename = url_object.get_relative_filename()
                elif self._cfg.localise==1:
                    urlfilename = fullfilename

                # replace '\\' with '/'
                urlfilename = urlfilename.replace('\\','/')
                newurl=''
                oldurl=''

                # If we cannot get the filenames, replace
                # relative url paths with full url paths so that
                # the user can connect to them.
                if not os.path.exists(fullfilename):
                    # for relative links, replace it with the
                    # full url path
                    fullurlpath = url_object.get_full_url_sans_port()
                    newurl = "href=\"" + fullurlpath + "\""
                else:
                    # replace url with urlfilename
                    if typ == 'anchor':
                        anchor_part = url_object.get_anchor()
                        urlfilename = "".join((urlfilename, anchor_part))
                        # v = "".join((v, anchor_part))

                    if self._cfg.localise == 1:
                        newurl= "".join((http_str, "=\"", "file://", urlfilename, "\""))
                    else:
                        newurl= "".join((http_str, "=\"", urlfilename, "\""))
            else:
                newurl="".join((http_str,"=\"","\""))

            if typ != 'img':
                oldurl = "".join((http_str, "=\"", v, "\""))
                try:
                    oldurlre = re.compile("".join((http_str,'=','\\"?',v,'\\"?')))
                except Exception, e:
                    debug("Error:",str(e))
                    continue

                # Get the location of the link in the file
                try:
                    if oldurl != newurl:
                        data = re.sub(oldurlre, newurl, data,1)
                except Exception, e:
                    debug("Error:",str(e))
                    continue
            else:
                try:
                    oldurlre1 = "".join((http_str,'=','\\"?',v,'\\"?'))
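
The multipart section above splits a download of clength bytes into numparts byte ranges, with the last range absorbing the remainder, and assigns each range as an inclusive (start, end) pair. The following is a minimal standalone sketch of just that arithmetic in plain Python 3; split_ranges is a hypothetical helper, not a HarvestMan API.

    def split_ranges(clength, parts):
        """Return a list of inclusive (start, end) byte ranges covering clength bytes."""
        piecesz = clength // parts
        pcsizes = [piecesz] * parts
        # The last piece absorbs the remainder so the sizes sum to clength.
        pcsizes[-1] += clength % parts

        ranges = []
        prev = 0
        for size in pcsizes:
            nxt = prev + size
            # HTTP Range offsets are inclusive, hence the -1 on the end offset.
            ranges.append((prev, nxt - 1))
            prev = nxt
        return ranges

    # Example: a 1000-byte file in 3 parts -> [(0, 332), (333, 665), (666, 999)]
    print(split_ranges(1000, 3))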
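archive_project packs the project directory into a .tar.gz or .tar.bz2 archive with the tarfile module, skipping the hm-cache cache directory. Here is a simplified sketch of the same idea in plain Python 3, assuming a basedir/project layout like the one used above; the function name and parameters are illustrative, not the module's own API.

    import os
    import tarfile

    def archive_project_dir(basedir, project, projdir, compression='gz'):
        """Pack projdir (minus the cache dir) into <basedir>/<project>.tar.<compression>."""
        archive_path = os.path.join(basedir, '%s.tar.%s' % (project, compression))
        with tarfile.open(archive_path, 'w:' + compression) as tf:
            for item in os.listdir(projdir):
                # Skip the cache directory, mirroring the 'hm-cache' check above.
                if item == 'hm-cache':
                    continue
                fullpath = os.path.join(projdir, item)
                if os.path.isdir(fullpath):
                    # arcname keeps paths relative to the project directory name.
                    tf.add(fullpath, arcname=os.path.join(os.path.basename(projdir), item))
        return archive_path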
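localise_file_links rewrites href/src attributes so saved pages point at local files instead of the original URLs, building one regular expression per link and replacing only the first match. The reduced single-URL sketch below shows that substitution in plain Python 3; localise_link, the example URL, and the escaping via re.escape are illustrative simplifications, not HarvestMan code.

    import re

    def localise_link(html, remote_url, local_path, attr='href'):
        """Rewrite attr="remote_url" (quotes optional) to point at local_path."""
        # Match attr="url" or attr=url, like the oldurlre pattern above.
        pattern = re.compile(attr + r'=\"?' + re.escape(remote_url) + r'\"?')
        replacement = '%s="%s"' % (attr, local_path.replace('\\', '/'))
        # Replace only the first occurrence, mirroring re.sub(..., 1) above.
        return pattern.sub(replacement, html, 1)

    html = '<a href="http://www.example.com/page.html">link</a>'
    print(localise_link(html, 'http://www.example.com/page.html', 'www.example.com/page.html'))
    # -> <a href="www.example.com/page.html">link</a>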
