# datamgr.py -- data manager module of HarvestMan (excerpt)
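# Note: the methods below belong to HarvestMan's data manager class (it is
# registered and looked up via GetObject('datamanager')). The class statement,
# the module imports (os, time, math) and the shared helpers GetObject, info,
# moreinfo, extrainfo, debug, plural and wasOrWere are assumed to be defined
# above this excerpt.
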
def update_file_stats(self, urlObject, status):
    """ Update the saved files/cache information for this url object """
    try:
        # The acquire here matches the release in the 'finally' clause below
        self._dataLock.acquire()
        configobj = GetObject('config')
        if urlObject is None: return -1
        # Bug: we should be getting this url as rooturl and not
        # the base url of this url.
        filename, rooturl = urlObject.get_full_filename(), urlObject.get_full_url()
        ok=False
        # Status == 1 or 2 means look up in "_savedfiles"
        # Status == 3 means look up in "_cacheinfo"
        lookuplist=[]
        if status == 1 or status == 2:
            lookuplist = self._downloaddict['_savedfiles']
        elif status == 3:
            lookuplist = self._downloaddict['_cacheinfo']
        else:
            return -1
        for x in lookuplist:
            # Already added (compare the whole filename, not just its first character)
            if x == filename:
                ok=True
                break
        if not ok:
            lookuplist.append( filename )
    finally:
        self._dataLock.release()

    lsaved = len(self._downloaddict['_savedfiles'])
    lmax = configobj.maxfiles
    if lsaved >= lmax:
        moreinfo('Specified file limit of', lmax, 'reached!')
        # Get tracker queue object
        tq = GetObject('trackerqueue')
        tq.killTrackers()
    # See if some tracker still managed to download files
    # while we were killing; if so, delete the extra files.
    lsaved = len(self._downloaddict['_savedfiles'])
    if lsaved > lmax:
        diff = lsaved - lmax
        savedcopy = (self._downloaddict['_savedfiles'])[0:]
        for x in range(0, diff):
            # 2 bugs: fixed a bug where the deletion
            # was not controlled
            lastfile = savedcopy[lsaved - x - 1]
            # Sometimes files may not be there; attempt
            # to delete only if the file is there (makes sense)
            if os.path.exists(lastfile):
                try:
                    extrainfo('Deleting file ', lastfile)
                    os.remove(lastfile)
                    (self._downloaddict['_deletedfiles']).append(lastfile)
                    # Bugfix: if a file is deleted, remove it from the saved list too.
                    self._downloaddict['_savedfiles'].remove(lastfile)
                except (OSError, IndexError, ValueError), e:
                    print e

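# Note on the download dictionary used above: '_savedfiles', '_cacheinfo' and
# '_deletedfiles' are treated here as lists of local filenames. That layout is
# inferred from the code in this excerpt, not from separate documentation.
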
def update_links(self, filename, urlobj):
    """ Update the links dictionary for this html file """
    try:
        self._dataLock2.acquire()
        if self._linksdict.has_key(filename):
            links = self._linksdict[filename]
            # Add the url object only if it is not already present
            if urlobj not in links:
                links.append(urlobj)
        else:
            self._linksdict[filename] = [urlobj]
    finally:
        self._dataLock2.release()

def thread_download(self, urlObj):
    """ Download this url object in a separate thread """
    # Add this task to the url thread pool
    self._urlThreadPool.push( urlObj )

def has_download_threads(self):
    """ Return true if there are any download sub-threads
    running, else return false """
    num_threads = self._urlThreadPool.has_busy_threads()
    if num_threads:
        return True
    return False

def last_download_thread_report_time(self):
    """ Get the time stamp of the last completed
    download (sub) thread """
    return self._urlThreadPool.last_thread_report_time()

def kill_download_threads(self):
    """ Terminate all the download threads """
    self._urlThreadPool.end_all_threads()

def create_local_directory(self, urlObj):
    """ Create the directories on the disk for downloading
    this url object """
    directory = urlObj.get_local_directory()
    try:
        if not os.path.isdir( directory ):
            if not os.path.exists( directory ):
                os.makedirs( directory )
                extrainfo("Created => ", directory)
                return 0
    except OSError, e:
        moreinfo("OS Exception", e)
        return -1

    return 0

def download_url(self, urlobj):
    try:
        data=""
        configobj = GetObject('config')
        if not configobj.usethreads or urlobj.is_webpage():
            # Connector object
            # New feature: use cached connections.
            # Check the connector cache dictionary for a previous
            # connection for this url.
            new_connector = False
            server = urlobj.get_domain()
            try:
                conn = self._connectorcache[urlobj]
            except KeyError:
                conn_factory = GetObject('connectorfactory')
                # This call will block if we exceed the number of connections
                # moreinfo("Creating connector for url ", urlobj.get_full_url())
                conn = conn_factory.create_connector( server )
                new_connector = True

            res = conn.save_url( urlobj, new_connector )
            # Remove the connector from the factory
            if new_connector:
                conn_factory.remove_connector(conn, server)

            # Return values for res
            # 0 => error, file not downloaded
            # 1 => file downloaded ok
            # 2 => file downloaded with filename modification
            # 3 => file was not downloaded because cache was uptodate
            filename = urlobj.get_full_filename()
            if res:
                if res==2:
                    # There was a filename modification, so get the new filename
                    filename = GetObject('modfilename')
                else:
                    filename = urlobj.get_full_filename()
                if res==1:
                    moreinfo("Saved to ", filename)
                self.update_file_stats( urlobj, res )
                # Get the data fetched and return it
                if urlobj.is_webpage():
                    data=conn.get_data()
            else:
                fetchurl = urlobj.get_full_url()
                extrainfo("Failed to download url", fetchurl)
                # We dont re-fetch fatal errors
                err = conn.get_error()
                # print 'Error number => ', err['number']
                if not err['fatal']:
                    self.update_failed_files(urlobj)
        else:
            self.thread_download( urlobj )

        return data
    finally:
        pass

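# Illustrative usage (a sketch only; the crawler/tracker classes live in other
# HarvestMan modules and the exact call sequence may differ):
#
#   dmgr = GetObject('datamanager')
#   dmgr.create_local_directory(urlobj)
#   data = dmgr.download_url(urlobj)
#
# where 'urlobj' is a HarvestMan url object exposing get_full_url(),
# get_full_filename(), get_domain() and is_webpage() as used above.
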
def is_file_downloaded(self, filename):
    """ Find if the <filename> is present in the
    saved files list """
    yes=0
    abspath1=os.path.abspath(filename)
    # Use a separate loop variable so the 'filename' argument is not shadowed
    for savedfile in self._downloaddict['_savedfiles']:
        abspath2=os.path.abspath(savedfile)
        if abspath1==abspath2:
            yes=1
            break
    return yes

def localise_links(self):
    """ Localise all links (urls) of the downloaded html pages """
    info('Localising links of downloaded html files ...')
    dmgr = GetObject('datamanager')
    for filename in dmgr.get_links_dictionary().keys():
        debug("Localising links for ", filename)
        self.localise_file_links(filename)

def localise_file_links(self, filename):
    """ Localise links for this file """
    # Open the file
    try:
        f = open(filename, 'r')
    except IOError, e:
        debug('IOException: ', str(e))
        return -1

    configobj = GetObject('config')
    data = ''
    while 1:
        try:
            l = f.readline()
            if l == '': break
            # Bugfix: skip the BASE HREF line
            if l.lower().find("<base href") != -1: continue
            data += l
        except (IOError, EOFError), e:
            print e
            return -1
    # Close the file
    f.close()

    dmgr = GetObject('datamanager')
    links = (dmgr.get_links_dictionary())[filename]

    for u in links:
        url_object = u
        # Bug fix: dont localise cgi links
        if url_object.is_cgi() or not url_object.is_filename_url(): continue
        typ = url_object.get_type()
        v = url_object.get_url()
        configobj = GetObject('config')
        # If the link is relative, and we dont need
        # absolute localisation, we dont need to do
        # anything.
        if configobj.localise == 2:
            if url_object.is_relative_path() and not url_object.is_relative_to_server():
                continue
        fullfilename = os.path.abspath( url_object.get_full_filename() )
        urlfilename = ''
        # Modification: localisation w.r.t relative pathnames
        if configobj.localise == 2:
            urlfilename = url_object.get_relative_filename()
        elif configobj.localise == 1:
            urlfilename = fullfilename
        # Modification: get any filename mappings from the
        # HarvestManGlobals module (probably a dynamically generated file
        # which was renamed)
        try:
            oldnewmappings = GetObject('oldnewmappings')
            newfilename = oldnewmappings[fullfilename]
            if configobj.localise == 2:
                urlfilename = (os.path.split(newfilename))[1]
            elif configobj.localise == 1:
                urlfilename = os.path.abspath(newfilename)
        except KeyError:
            # No mapping for this file; keep the filename computed above
            pass
        # Replace '\\' with '/'
        urlfilename = urlfilename.replace('\\', '/')
        newurl = ''
        # Bug: if we cannot get the filenames, replace
        # relative url paths with full url paths so that
        # the user can connect to them.
        if not os.path.exists(fullfilename):
            # moreinfo("Path does not exist !", fullfilename)
            # For relative links, replace them with the
            # full url path
            fullurlpath = url_object.get_full_url_sans_port()
            newurl = "href=\"" + fullurlpath + "\""
        else:
            # Replace url with urlfilename
            # Bug: fix for anchor links
            if typ == 'anchor':
                urlfilename += url_object.get_anchor()
            if configobj.localise == 1:
                newurl = "href=\"" + "file://" + urlfilename + "\""
            else:
                newurl = "href=\"" + urlfilename + "\""
        # Replace both lowercase and uppercase forms of the link in the data
        oldurl1 = "href=\"" + v + "\""
        oldurl2 = "HREF=\"" + v + "\""
        data = data.replace(oldurl1, newurl)
        data = data.replace(oldurl2, newurl)

    try:
        fw = open(filename, 'w')
    except IOError, e:
        debug('IOException: ', str(e))
        return -1
    fw.write(data)
    fw.close()

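# Illustrative effect of the rewriting above (hypothetical paths):
# a page containing href="http://www.foo.com/img/pic.gif" becomes, when the
# file exists on disk,
#   href="file:///home/user/project/www.foo.com/img/pic.gif"  with localise == 1
#   href="../img/pic.gif"                                      with localise == 2
# and falls back to the full remote url when the local file is missing.
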
def print_project_info(self, statsd):
    """ Print project information """
    nlinks = statsd['links']
    nservers = statsd['extservers'] + 1
    nfiles = statsd['files']
    ndirs = statsd['extdirs'] + 1
    numfailed = statsd['failed']
    nretried = statsd['retries']
    fatal = statsd['fatal']
    fetchtime = statsd['fetchtime']
    nfilesincache = statsd['filesincache']
    # Bug fix, download time to be calculated
    # precisely...
    cfg = GetObject('config')
    dnldtime = fetchtime

    strings = [('link', nlinks), ('server', nservers), ('file', nfiles), ('file', nfilesincache),
               ('directory', ndirs), ('link', numfailed), ('link', fatal),
               ('link', nretried)]
    fns = map(plural, strings)
    info(' ')

    if fetchtime and nfiles:
        # Force float division so whole-number times do not truncate the rate
        fps = float(nfiles)/dnldtime
        fps = float((math.modf(fps*100.0))[1]/100.0)
    else:
        fps = 0.0

    bytes = self._bytes
    ratespec = 'KB/sec'
    if bytes and dnldtime:
        bps = (float(bytes)/dnldtime)/100.0
        bps = float((math.modf(bps*100.0))[1]/1000.0)
        if bps < 1.0:
            bps *= 1000.0
            ratespec = 'bytes/sec'
    else:
        bps = 0.0

    configobj = GetObject('config')
    info('HarvestMan mirror', configobj.project, 'completed in', fetchtime, 'seconds.')
    if nlinks: info(nlinks, fns[0], 'scanned in', nservers, fns[1], ',', nfiles, fns[2], 'written.')
    else: info('No links scanned, no file written.\n')
    if nfilesincache:
        info(nfilesincache, fns[3], wasOrWere(nfilesincache), 'already uptodate in the cache for this project and', wasOrWere(nfilesincache), 'not updated.')
    if fatal: info(fatal, fns[6], 'had fatal errors and failed to download.')
    if bytes: info(bytes, 'bytes received at the rate of', bps, ratespec, '.\n')

    nlocked = GetObject('trackerqueue').get_locked_instances()
    debug('(Thread locking situation was avoided ', nlocked, 'times.)')

    # Get current time stamp
    s = time.localtime()
    tz = (time.tzname)[0]
    format = '%b %d %Y ' + tz + ' %H:%M:%S'
    tstamp = time.strftime(format, s)

    # Write stats to a stats file
    statsfile = configobj.project + '.hst'
    statsfile = os.path.abspath(os.path.join(configobj.projdir, statsfile))
    print 'Writing stats file ', statsfile, '...'
    # Append to the file's contents
    sf = open(statsfile, 'a')
    # Write url, file count, links count, time taken,
    # files per second, failed file count & time stamp
    infostr = 'url:' + configobj.url + ','
    infostr += 'files:' + str(nfiles) + ','
    infostr += 'links:' + str(nlinks) + ','
    infostr += 'dirs:' + str(ndirs) + ','
    infostr += 'failed:' + str(numfailed) + ','
    infostr += 'refetched:' + str(nretried) + ','
    infostr += 'fatal:' + str(fatal) + ','
    infostr += 'elapsed:' + str(fetchtime) + ','
    infostr += 'fps:' + str(fps) + ','
    infostr += 'bps:' + str(bps) + ','
    infostr += 'timestamp:' + tstamp
    infostr += '\n'
    sf.write(infostr)
    sf.close()
    print 'Done.'
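
# The record appended to the '<project>.hst' file above has this shape
# (illustrative values only):
# url:http://www.example.com,files:120,links:450,dirs:12,failed:3,refetched:5,fatal:1,elapsed:318.6,fps:0.37,bps:14.2,timestamp:Jan 01 2005 IST 10:30:00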