
📄 AdaptiveRevisitFrontier.java

📁 A crawler frontier that works well combined with Lucene for full-text search (see the indexing sketch after the code listing)
💻 JAVA
📖 Page 1 of 4
            try {
                long completeTime = curi.getLong(A_FETCH_COMPLETED_TIME);
                long durationTaken =
                    (completeTime - curi.getLong(A_FETCH_BEGAN_TIME));

                // Politeness delay: scale the time the fetch took by the
                // configured delay factor, then clamp to [min, max] delay.
                durationToWait = (long)(
                        ((Float) getAttribute(ATTR_DELAY_FACTOR, curi))
                            .floatValue() * durationTaken);

                long minDelay =
                    ((Integer) getAttribute(ATTR_MIN_DELAY, curi)).longValue();
                if (minDelay > durationToWait) {
                    // Wait at least the minimum.
                    durationToWait = minDelay;
                }

                long maxDelay =
                    ((Integer) getAttribute(ATTR_MAX_DELAY, curi)).longValue();
                if (durationToWait > maxDelay) {
                    // Wait no more than the maximum.
                    durationToWait = maxDelay;
                }
            } catch (AttributeNotFoundException e) {
                logger.severe("Unable to find attribute. " + curi.toString());
                // Fall back to waiting the maximum interval.
                durationToWait = DEFAULT_MAX_DELAY.longValue();
            }
        }

        long ret = durationToWait > DEFAULT_MIN_DELAY.longValue() ?
                durationToWait : DEFAULT_MIN_DELAY.longValue();
        logger.finest("Snooze time for " + curi.toString() + " = " + ret);
        return ret;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#discoveredUriCount()
     */
    public synchronized long discoveredUriCount() {
        return (this.alreadyIncluded != null) ?
                this.alreadyIncluded.count() : hostQueues.getSize();
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#queuedUriCount()
     */
    public synchronized long queuedUriCount() {
        return hostQueues.getSize();
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#finishedUriCount()
     */
    public long finishedUriCount() {
        return succeededFetchCount + failedFetchCount + disregardedUriCount;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#succeededFetchCount()
     */
    public long succeededFetchCount() {
        return succeededFetchCount;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#failedFetchCount()
     */
    public long failedFetchCount() {
        return failedFetchCount;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#disregardedUriCount()
     */
    public long disregardedUriCount() {
        // A getter should only report the counter, never mutate it.
        return disregardedUriCount;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#totalBytesWritten()
     */
    public long totalBytesWritten() {
        return totalProcessedBytes;
    }

    /**
     * Method is not supported by this Frontier implementation.
     * @param pathToLog
     * @throws IOException
     */
    public void importRecoverLog(String pathToLog) throws IOException {
        throw new IOException("Unsupported by this frontier.");
    }

    public synchronized FrontierMarker getInitialMarker(String regexpr,
            boolean inCacheOnly) {
        return null;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#getURIsList(org.archive.crawler.framework.FrontierMarker, int, boolean)
     */
    public synchronized ArrayList getURIsList(FrontierMarker marker,
            int numberOfMatches, boolean verbose)
        throws InvalidFrontierMarkerException {
        // TODO Auto-generated method stub
        return null;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#deleteURIs(java.lang.String)
     */
    public synchronized long deleteURIs(String match) {
        // TODO Auto-generated method stub
        return 0;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#deleted(org.archive.crawler.datamodel.CrawlURI)
     */
    public synchronized void deleted(CrawlURI curi) {
        // TODO Auto-generated method stub
    }

    public void considerIncluded(UURI u) {
        // This will cause the URI to be crawled!!!
        CrawlURI curi = new CrawlURI(u);
        innerSchedule(curi);
    }

    public void kickUpdate() {
        loadSeeds();
    }

    public void start() {
        unpause();
    }

    synchronized public void pause() {
        shouldPause = true;
        notifyAll();
    }

    synchronized public void unpause() {
        shouldPause = false;
        notifyAll();
    }

    synchronized public void terminate() {
        shouldTerminate = true;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#getFrontierJournal()
     */
    public FrontierJournal getFrontierJournal() {
        return null;
    }

    private static class ThreadLocalQueue
    extends ThreadLocal<Queue<CandidateURI>> implements Serializable {

        private static final long serialVersionUID = 8268977225156462059L;

        protected Queue<CandidateURI> initialValue() {
            return new MemQueue<CandidateURI>();
        }

        /**
         * @return Queue of 'batched' items
         */
        public Queue<CandidateURI> getQueue() {
            return get();
        }
    }

    /**
     * This method is not supported by this Frontier implementation.
     * @param pathToLog
     * @param retainFailures
     * @throws IOException
     */
    public void importRecoverLog(String pathToLog, boolean retainFailures)
    throws IOException {
        throw new IOException("Unsupported");
    }

    //
    // Reporter implementation
    //

    public String[] getReports() {
        // None but default for now.
        return new String[] {};
    }

    /* (non-Javadoc)
     * @see org.archive.util.Reporter#singleLineReport()
     */
    public String singleLineReport() {
        return ArchiveUtils.singleLineReport(this);
    }

    /* (non-Javadoc)
     * @see org.archive.util.Reporter#reportTo(java.io.Writer)
     */
    public void reportTo(PrintWriter writer) throws IOException {
        reportTo(null, writer);
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#oneLineReport()
     */
    public synchronized void singleLineReportTo(PrintWriter w) throws IOException {
        hostQueues.singleLineReportTo(w);
    }

    /* (non-Javadoc)
     * @see org.archive.util.Reporter#singleLineLegend()
     */
    public String singleLineLegend() {
        return hostQueues.singleLineLegend();
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#report()
     */
    public synchronized void reportTo(String name, PrintWriter writer) {
        // Ignore name; only one report for now.
        hostQueues.reportTo(name, writer);
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.event.CrawlStatusListener#crawlStarted(java.lang.String)
     */
    public void crawlStarted(String message) {
        // Not interested
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.event.CrawlStatusListener#crawlEnding(java.lang.String)
     */
    public void crawlEnding(String sExitMessage) {
        // Not interested
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.event.CrawlStatusListener#crawlEnded(java.lang.String)
     */
    public void crawlEnded(String sExitMessage) {
        // Cleanup!
        if (this.alreadyIncluded != null) {
            this.alreadyIncluded.close();
            this.alreadyIncluded = null;
        }
        hostQueues.close();
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.event.CrawlStatusListener#crawlPausing(java.lang.String)
     */
    public void crawlPausing(String statusMessage) {
        // Not interested
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.event.CrawlStatusListener#crawlPaused(java.lang.String)
     */
    public void crawlPaused(String statusMessage) {
        // Not interested
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.event.CrawlStatusListener#crawlResuming(java.lang.String)
     */
    public void crawlResuming(String statusMessage) {
        // Not interested
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.event.CrawlStatusListener#crawlCheckpoint(java.io.File)
     */
    public void crawlCheckpoint(File checkpointDir) throws Exception {
        // Not interested
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.datamodel.UriUniqFilter.HasUriReceiver#receive(org.archive.crawler.datamodel.CandidateURI)
     */
    public void receive(CandidateURI item) {
        System.out.println("Received " + item);
        innerSchedule(item);
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#getGroup(org.archive.crawler.datamodel.CrawlURI)
     */
    public FrontierGroup getGroup(CrawlURI curi) {
        try {
            return getHQ(curi);
        } catch (IOException ioe) {
            throw new RuntimeException(ioe);
        }
    }

    public long averageDepth() {
        return hostQueues.getAverageDepth();
    }

    public float congestionRatio() {
        return hostQueues.getCongestionRatio();
    }

    public long deepestUri() {
        return hostQueues.getDeepestQueueSize();
    }
}
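
The description above suggests pairing this frontier's crawl output with Lucene for full-text search. The frontier itself only schedules and paces fetches, so indexing would happen elsewhere. Below is a minimal, hypothetical sketch of that idea: the class PageIndexer and its methods are assumptions, not part of Heritrix or of this file, and the Lucene calls follow the older 2.x-style IndexWriter/Document/Field API that was current in the Heritrix 1.x era.

import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

/**
 * Hypothetical helper that feeds pages fetched by the crawler into a
 * Lucene index. A sketch only; not part of AdaptiveRevisitFrontier.
 */
public class PageIndexer {

    private final IndexWriter writer;

    public PageIndexer(String indexDir) throws IOException {
        // Lucene 2.x-style constructor: index path, analyzer, create flag.
        this.writer = new IndexWriter(indexDir, new StandardAnalyzer(), true);
    }

    /** Add one fetched page (URL plus extracted text) to the index. */
    public void index(String url, String extractedText) throws IOException {
        Document doc = new Document();
        // Store the URL verbatim so hits can be resolved back to the page.
        doc.add(new Field("url", url,
                Field.Store.YES, Field.Index.UN_TOKENIZED));
        // Tokenize the page text for full-text search; no need to store it.
        doc.add(new Field("content", extractedText,
                Field.Store.NO, Field.Index.TOKENIZED));
        writer.addDocument(doc);
    }

    public void close() throws IOException {
        writer.optimize();
        writer.close();
    }
}

In practice the text would typically come from a post-crawl step that reads the archive files Heritrix writes, or from a custom processor in the crawl chain, rather than from the frontier itself.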
