⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 abstractfrontier.java

📁 高性能分词算法
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
//            final Class qapClass = Class.forName((String)getUncheckedAttribute(//                    null, ATTR_QUEUE_ASSIGNMENT_POLICY));////            queueAssignmentPolicy =//                (QueueAssignmentPolicy)qapClass.newInstance();//        } catch (Exception e) {//            logger.log(Level.SEVERE, "Bad queue assignment policy class", e);//            throw new FatalConfigurationException(e.getMessage());//        }    }    synchronized public void terminate() {        shouldTerminate = true;        if (this.recover != null) {            this.recover.close();            this.recover = null;        }        unpause();    }    /**     * Report CrawlURI to each of the three 'substats' accumulators     * (group/queue, server, host) for a given stage.      *      * @param curi     * @param stage     */    protected void tally(CrawlURI curi, Stage stage) {        // Tally per-server, per-host, per-frontier-class running totals        CrawlServer server =            controller.getServerCache().getServerFor(curi);        if (server != null) {            server.getSubstats().tally(curi, stage);        }        CrawlHost host =             controller.getServerCache().getHostFor(curi);        if (host != null) {            host.getSubstats().tally(curi, stage);        }         FrontierGroup group =             controller.getFrontier().getGroup(curi);        group.getSubstats().tally(curi, stage);    }        protected void doJournalFinishedSuccess(CrawlURI c) {        tally(c,CrawlSubstats.Stage.SUCCEEDED);        if (this.recover != null) {            this.recover.finishedSuccess(c);        }    }    protected void doJournalAdded(CrawlURI c) {        tally(c,CrawlSubstats.Stage.SCHEDULED);        if (this.recover != null) {            this.recover.added(c);        }    }    protected void doJournalRescheduled(CrawlURI c) {        tally(c,CrawlSubstats.Stage.RETRIED);        if (this.recover != null) {            this.recover.rescheduled(c);        }    }    
protected void doJournalFinishedFailure(CrawlURI c) {
        // Tally as FAILED, then forward to recover.finishedFailure if set.
        tally(c, CrawlSubstats.Stage.FAILED);
        if (this.recover == null) {
            return;
        }
        this.recover.finishedFailure(c);
    }

    /**
     * Tallies the URI as DISREGARDED and, when {@code recover} is set,
     * forwards it to {@code recover.finishedDisregard}.
     *
     * @param c the URI that was disregarded
     */
    protected void doJournalDisregarded(CrawlURI c) {
        tally(c, CrawlSubstats.Stage.DISREGARDED);
        if (this.recover == null) {
            return;
        }
        this.recover.finishedDisregard(c);
    }

    /**
     * Forwards the URI to {@code recover.emitted} when {@code recover} is
     * set; no tally is made for emission.
     *
     * @param c the URI being emitted
     */
    protected void doJournalEmitted(CrawlURI c) {
        if (this.recover == null) {
            return;
        }
        this.recover.emitted(c);
    }

    /**
     * Reports whether the frontier is empty, judged by the live queued-URI
     * counter being zero.
     *
     * @return true if the queued-URI count is zero
     */
    public boolean isEmpty() {
        final long queued = liveQueuedUriCount.get();
        return queued == 0;
    }

    /**
     * Adds one to the running count of queued URIs.
     */
    protected void incrementQueuedUriCount() {
        liveQueuedUriCount.incrementAndGet();
    }

    /**
     * Adds the given amount to the running count of queued URIs.
     *
     * @param increment
     *            amount to add to the queued count
     */
    protected void incrementQueuedUriCount(long increment) {
        final long delta = increment;
        liveQueuedUriCount.addAndGet(delta);
    }

    /**
     * Subtracts the given number of deleted URIs from the queued count.
     *
     * @param numberOfDeletes
     *            how many queued URIs were deleted
     */
    protected void decrementQueuedCount(long numberOfDeletes) {
        final long delta = -numberOfDeletes;
        liveQueuedUriCount.addAndGet(delta);
    }

    /**
     * Returns the current value of the queued-URI counter.
     *
     * @see org.archive.crawler.framework.Frontier#queuedUriCount()
     */
    public long queuedUriCount() {
        final long queued = liveQueuedUriCount.get();
        return queued;
    }

    /**
     * Returns the sum of the succeeded, failed and disregarded counters.
     *
     * @see org.archive.crawler.framework.Frontier#finishedUriCount()
     */
    public long finishedUriCount() {
        // read in the same order as the sum is written: succeeded, failed,
        // disregarded
        final long succeeded = liveSucceededFetchCount.get();
        final long failed = liveFailedFetchCount.get();
        final long disregarded = liveDisregardedUriCount.get();
        return succeeded + failed + disregarded;
    }

    /**
     * Increment the running count of successfully fetched URIs.
*/    protected void incrementSucceededFetchCount() {        liveSucceededFetchCount.incrementAndGet();    }    /**     * (non-Javadoc)     *      * @see org.archive.crawler.framework.Frontier#succeededFetchCount()     */    public long succeededFetchCount() {        return liveSucceededFetchCount.get();    }    /**     * Increment the running count of failed URIs.      */    protected void incrementFailedFetchCount() {        liveFailedFetchCount.incrementAndGet();    }    /**     * (non-Javadoc)     *      * @see org.archive.crawler.framework.Frontier#failedFetchCount()     */    public long failedFetchCount() {        return liveFailedFetchCount.get();    }    /**     * Increment the running count of disregarded URIs. Synchronized because     * operations on longs are not atomic.     */    protected void incrementDisregardedUriCount() {        liveDisregardedUriCount.incrementAndGet();    }    public long disregardedUriCount() {        return liveDisregardedUriCount.get();    }    /** @deprecated misnomer; use StatisticsTracking figures instead */    public long totalBytesWritten() {        return totalProcessedBytes;    }    /**     * Load up the seeds.     *      * This method is called on initialize and inside in the crawlcontroller     * when it wants to force reloading of configuration.     *      * @see org.archive.crawler.framework.CrawlController#kickUpdate()     */    public void loadSeeds() {        Writer ignoredWriter = new StringWriter();        logger.info("beginning");        // Get the seeds to refresh.        
Iterator iter = this.controller.getScope().seedsIterator(ignoredWriter);        int count = 0;         while (iter.hasNext()) {            UURI u = (UURI)iter.next();            CandidateURI caUri = CandidateURI.createSeedCandidateURI(u);            caUri.setSchedulingDirective(CandidateURI.MEDIUM);            if (((Boolean)getUncheckedAttribute(null, ATTR_SOURCE_TAG_SEEDS))                    .booleanValue()) {                caUri.putString(CoreAttributeConstants.A_SOURCE_TAG,caUri.toString());                caUri.makeHeritable(CoreAttributeConstants.A_SOURCE_TAG);            }            schedule(caUri);            count++;            if(count%1000==0) {                logger.info(count+" seeds");            }        }        // save ignored items (if any) where they can be consulted later        saveIgnoredItems(ignoredWriter.toString(), controller.getDisk());        logger.info("finished");    }    /**     * Dump ignored seed items (if any) to disk; delete file otherwise.     * Static to allow non-derived sibling classes (frontiers not yet      * subclassed here) to reuse.     *      * @param ignoredItems     * @param dir      */    public static void saveIgnoredItems(String ignoredItems, File dir) {        File ignoredFile = new File(dir, IGNORED_SEEDS_FILENAME);        if(ignoredItems==null | ignoredItems.length()>0) {            try {                BufferedWriter bw = new BufferedWriter(new FileWriter(ignoredFile));                bw.write(ignoredItems);                bw.close();            } catch (IOException e) {                // TODO make an alert?                
e.printStackTrace();            }        } else {            // delete any older file (if any)            ignoredFile.delete();        }    }    protected CrawlURI asCrawlUri(CandidateURI caUri) {        CrawlURI curi;        if (caUri instanceof CrawlURI) {            curi = (CrawlURI)caUri;        } else {            curi = CrawlURI.from(caUri, nextOrdinal.getAndIncrement());        }        curi.setClassKey(getClassKey(curi));        return curi;    }    /**     * @param now     * @throws InterruptedException     * @throws EndedException     */    protected synchronized void preNext(long now) throws InterruptedException,            EndedException {        if (this.controller == null) {            return;        }                // Check completion conditions        if (this.controller.atFinish()) {            if (((Boolean)getUncheckedAttribute(null, ATTR_PAUSE_AT_FINISH))                    .booleanValue()) {                this.controller.requestCrawlPause();            } else {                this.controller.beginCrawlStop();            }        }        // enforce operator pause        if (shouldPause) {            while (shouldPause) {                this.controller.toePaused();                wait();            }            // exitted pause; possibly finish regardless of pause-at-finish            if (controller != null && controller.atFinish()) {                this.controller.beginCrawlStop();            }        }        // enforce operator terminate or thread retirement        if (shouldTerminate                || ((ToeThread)Thread.currentThread()).shouldRetire()) {            throw new EndedException("terminated");        }        enforceBandwidthThrottle(now);    }    /**     * Perform any special handling of the CrawlURI, such as promoting its URI     * to seed-status, or preferencing it because it is an embed.     
*      * @param curi     */    protected void applySpecialHandling(CrawlURI curi) {        if (curi.isSeed() && curi.getVia() != null                && curi.flattenVia().length() > 0) {            // The only way a seed can have a non-empty via is if it is the            // result of a seed redirect. Add it to the seeds list.            //            // This is a feature. This is handling for case where a seed            // gets immediately redirected to another page. What we're doing is            // treating the immediate redirect target as a seed.            this.controller.getScope().addSeed(curi);            // And it needs rapid scheduling.	    if (curi.getSchedulingDirective() == CandidateURI.NORMAL)                curi.setSchedulingDirective(CandidateURI.MEDIUM);        }        // optionally preferencing embeds up to MEDIUM        int prefHops = ((Integer)getUncheckedAttribute(curi,                ATTR_PREFERENCE_EMBED_HOPS)).intValue();        if (prefHops > 0) {            int embedHops = curi.getTransHops();            if (embedHops > 0 && embedHops <= prefHops                    && curi.getSchedulingDirective() == CandidateURI.NORMAL) {                // number of embed hops falls within the preferenced range, and                // uri is not already MEDIUM -- so promote it                curi.setSchedulingDirective(CandidateURI.MEDIUM);            }        }    }    /**     * Perform fixups on a CrawlURI about to be returned via next().     
*      * @param curi     *            CrawlURI about to be returned by next()     * @param q     *            the queue from which the CrawlURI came     */    protected void noteAboutToEmit(CrawlURI curi, WorkQueue q) {        curi.setHolder(q);        // if (curi.getServer() == null) {        //    // TODO: perhaps short-circuit the emit here,        //    // because URI will be rejected as unfetchable        // }        doJournalEmitted(curi);    }    /**     * @param curi     * @return the CrawlServer to be associated with this CrawlURI     */    protected CrawlServer getServer(CrawlURI curi) {        return this.controller.getServerCache().getServerFor(curi);    }    /**     * Return a suitable value to wait before retrying the given URI.     *      * @param curi     *            CrawlURI to be retried     * @return millisecond delay before retry     */    protected long retryDelayFor(CrawlURI curi) {        int status = curi.getFetchStatus();        return (status == S_CONNECT_FAILED || status == S_CONNECT_LOST ||                status == S_DOMAIN_UNRESOLVABLE)?            ((Long)getUncheckedAttribute(curi, ATTR_RETRY_DELAY)).longValue():            0; // no delay for most    }    /**     * Update any scheduling structures with the new information in this     * CrawlURI. Chiefly means make necessary arrangements for no other URIs at     * the same host to be visited within the appropriate politeness window.     
*      * @param curi     *            The CrawlURI     * @return millisecond politeness delay     */    protected long politenessDelayFor(CrawlURI curi) {        long durationToWait = 0;        if (curi.containsKey(A_FETCH_BEGAN_TIME)                && curi.containsKey(A_FETCH_COMPLETED_TIME)) {            long completeTime = curi.getLong(A_FETCH_COMPLETED_TIME);            long durationTaken = (completeTime - curi                    .getLong(A_FETCH_BEGAN_TIME));            durationToWait = (long)(((Float)getUncheckedAttribute(curi,                    ATTR_DELAY_FACTOR)).floatValue() * durationTaken);            long minDelay = ((Integer)getUncheckedAttribute(curi,                    ATTR_MIN_DELAY)).longValue();            

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -