⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 bdbfrontier.java

📁 高性能分词算法
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
            // Assume its BdbUriUniqFilter.
            uuf = this.controller.isCheckpointRecover()?
                deserializeAlreadySeen(BdbUriUniqFilter.class,
                    this.controller.getCheckpointRecover().getDirectory()):
                new BdbUriUniqFilter(this.controller.getBdbEnvironment());
            if (this.controller.isCheckpointRecover()) {
                // If recover, need to call reopen of the db.
                try {
                    ((BdbUriUniqFilter)uuf).
                        reopen(this.controller.getBdbEnvironment());
                } catch (DatabaseException e) {
                    throw new IOException(e.getMessage());
                }
            }
        }
        uuf.setDestination(this);
        return uuf;
    }

    /**
     * Read a serialized already-seen filter back from a checkpoint directory.
     *
     * @param cls concrete filter class to deserialize
     * @param dir checkpoint directory passed to
     * {@code CheckpointUtils.readObjectFromFile}
     * @return the deserialized filter
     * @throws FileNotFoundException declared for callers; presumably raised
     * by the underlying read when the serialized file is missing
     * @throws IOException if the read fails, or if the class cannot be
     * resolved (the {@link ClassNotFoundException} is attached as the cause)
     */
    protected UriUniqFilter deserializeAlreadySeen(
            final Class<? extends UriUniqFilter> cls,
            final File dir)
    throws FileNotFoundException, IOException {
        UriUniqFilter uuf = null;
        try {
            logger.fine("Started deserializing " + cls.getName() +
                " of checkpoint recover.");
            uuf = CheckpointUtils.readObjectFromFile(cls, dir);
            // Name the class actually read; this method is not bdbje-specific.
            logger.fine("Finished deserializing " + cls.getName() +
                " as part of checkpoint recover.");
        } catch (ClassNotFoundException e) {
            // Keep the original exception as the cause, in the same way
            // initQueue() does, so the stack trace is not lost.
            throw (IOException)new IOException("Failed to deserialize " +
                cls.getName() + ": " + e.getMessage()).initCause(e);
        }
        return uuf;
    }

    /**
     * Return the work queue for the given CrawlURI's classKey. URIs
     * are ordered and politeness-delayed within their 'class'.
     *
     * <p>If no queue is registered under the key, a new
     * {@code BdbWorkQueue} is created, given the total budget read from
     * {@code ATTR_QUEUE_TOTAL_BUDGET} for this URI, and registered. Lookup
     * and creation happen while holding the lock on {@code allQueues}.
     *
     * @param curi CrawlURI to base queue on
     * @return the found or created BdbWorkQueue
     */
    protected WorkQueue getQueueFor(CrawlURI curi) {
        WorkQueue wq;
        String classKey = curi.getClassKey();
        synchronized (allQueues) {
            wq = (WorkQueue)allQueues.get(classKey);
            if (wq == null) {
                wq = new BdbWorkQueue(classKey, this);
                wq.setTotalBudget(((Long)getUncheckedAttribute(
                    curi,ATTR_QUEUE_TOTAL_BUDGET)).longValue());
                allQueues.put(classKey, wq);
            }
        }
        return wq;
    }

    /**
     * Return the work queue for the given classKey, or null
     * if no such queue exists.
     *
     * @param classKey key to look for
     * @return the found WorkQueue
     */
    protected WorkQueue getQueueFor(String classKey) {
        WorkQueue wq;
        synchronized (allQueues) {
            wq = (WorkQueue)allQueues.get(classKey);
        }
        return wq;
    }

    /**
     * Obtain a marker for paging through pending URIs that match a
     * regular expression.
     *
     * @param regexpr expression handed to the pending-URI store
     * @param inCacheOnly not consulted by this implementation
     * @return the marker produced by the pending-URI store
     */
    public FrontierMarker getInitialMarker(String regexpr,
            boolean inCacheOnly) {
        return pendingUris.getInitialMarker(regexpr);
    }

    /**
     * Return list of urls.
     *
     * <p>Each entry has the form {@code "[classKey] singleLineReport"}.
     *
     * @param marker position to read from
     * @param numberOfMatches maximum number of entries requested from the
     * pending-URI store
     * @param verbose not consulted by this implementation
     * @return List of URIs (strings).
     * @throws RuntimeException wrapping the {@code DatabaseException} if the
     * pending-URI store cannot be read
     */
    public ArrayList<String> getURIsList(FrontierMarker marker,
            int numberOfMatches, final boolean verbose) {
        List<?> curis;
        try {
            curis = pendingUris.getFrom(marker, numberOfMatches);
        } catch (DatabaseException e) {
            // The wrapped exception carries the full cause; no separate
            // stack-trace dump to stderr is needed.
            throw new RuntimeException(e);
        }
        ArrayList<String> results = new ArrayList<String>(curis.size());
        for (Object item : curis) {
            CrawlURI curi = (CrawlURI)item;
            results.add("["+curi.getClassKey()+"] "+curi.singleLineReport());
        }
        return results;
    }

    /**
     * Create the store holding all pending URIs.
     *
     * @throws IOException if the store cannot be created; the underlying
     * {@code DatabaseException} is attached as the cause
     */
    protected void initQueue() throws IOException {
        try {
            this.pendingUris = createMultipleWorkQueues();
        } catch(DatabaseException e) {
            throw (IOException)new IOException(e.getMessage()).initCause(e);
        }
    }

    /**
     * Close the pending-URI store, first dumping everything still pending
     * to the log when {@code ATTR_DUMP_PENDING_AT_CLOSE} is set.
     *
     * <p>Does nothing beyond reading the attribute when the store was never
     * opened or has already been closed.
     */
    protected void closeQueue() {
        boolean dumpRequested = ((Boolean)getUncheckedAttribute(
            null,ATTR_DUMP_PENDING_AT_CLOSE)).booleanValue();
        // dumpAllPendingToLog() dereferences pendingUris, so skip the dump
        // when there is no store to read from.
        if (dumpRequested && this.pendingUris != null) {
            try {
                dumpAllPendingToLog();
            } catch (DatabaseException e) {
                logger.log(Level.WARNING,"dump pending problem",e);
            }
        }
        if (this.pendingUris != null) {
            this.pendingUris.close();
            this.pendingUris = null;
        }
    }

    /**
     * @return the pending-URI store; null after {@link #closeQueue()} has
     * run
     */
    protected BdbMultipleWorkQueues getWorkQueues() {
        return pendingUris;
    }

    /**
     * @return always true for this frontier
     */
    protected boolean workQueueDataOnDisk() {
        return true;
    }

    /**
     * Initialize the frontier, restoring state from a checkpoint first when
     * the controller reports a checkpoint recovery.
     *
     * @param c controller this frontier belongs to
     * @throws FatalConfigurationException if the checkpointed frontier
     * cannot be read back
     * @throws IOException declared for callers; propagated from
     * {@code super.initialize}
     */
    public void initialize(CrawlController c)
    throws FatalConfigurationException, IOException {
        this.controller = c;
        // fill in anything from a checkpoint recovery first (because
        // usual initialization will skip initQueueOfQueues in checkpoint)
        if (c.isCheckpointRecover()) {
            // If a checkpoint recover, copy old values from serialized
            // instance into this Frontier instance. Do it this way because
            // though it's possible to serialize BdbFrontier, it's currently
            // not possible to set/remove frontier attribute plugging the
            // deserialized object back into the settings system.
            // The below copying over is error-prone because it's easy
            // to miss a value.  Perhaps there's a better way?  Introspection?
            BdbFrontier f = null;
            try {
                f = (BdbFrontier)CheckpointUtils.
                    readObjectFromFile(this.getClass(),
                        c.getCheckpointRecover().getDirectory());
            } catch (IOException e) {
                // Also covers FileNotFoundException, which is an
                // IOException and was handled identically.
                throw new FatalConfigurationException("Failed checkpoint " +
                    "recover: " + e.getMessage());
            } catch (ClassNotFoundException e) {
                throw new FatalConfigurationException("Failed checkpoint " +
                    "recover: " + e.getMessage());
            }

            this.nextOrdinal = f.nextOrdinal;
            this.totalProcessedBytes = f.totalProcessedBytes;
            this.liveDisregardedUriCount = f.liveDisregardedUriCount;
            this.liveFailedFetchCount = f.liveFailedFetchCount;
            this.processedBytesAfterLastEmittedURI =
                f.processedBytesAfterLastEmittedURI;
            this.liveQueuedUriCount = f.liveQueuedUriCount;
            this.liveSucceededFetchCount = f.liveSucceededFetchCount;
            this.lastMaxBandwidthKB = f.lastMaxBandwidthKB;
            this.readyClassQueues = f.readyClassQueues;
            this.inactiveQueues = reinit(f.inactiveQueues,"inactiveQueues");
            this.retiredQueues = reinit(f.retiredQueues,"retiredQueues");
            this.snoozedClassQueues = f.snoozedClassQueues;
            this.inProcessQueues = f.inProcessQueues;
            super.initialize(c);
            wakeQueues();
        } else {
            // perform usual initialization
            super.initialize(c);
        }
    }

    /**
     * Close the inactive and retired queue stores, then let the superclass
     * finish the crawl-ended handling.
     *
     * @param sExitMessage exit message passed on to the superclass
     */
    @Override
    public void crawlEnded(String sExitMessage) {
        ((StoredQueue)inactiveQueues).close();
        ((StoredQueue)retiredQueues).close();
        super.crawlEnded(sExitMessage);
    }

    /**
     * Write this frontier's checkpoint: sync the pending-URI store, then
     * serialize the already-included filter and this frontier itself into
     * the checkpoint directory.
     *
     * @param checkpointDir directory receiving the serialized objects
     * @throws Exception propagated from the superclass, the sync, or the
     * serialization
     */
    public void crawlCheckpoint(File checkpointDir) throws Exception {
        super.crawlCheckpoint(checkpointDir);
        logger.fine("Started serializing already seen as part "
            + "of checkpoint. Can take some time.");
        // An explicit sync on any deferred-write dbs is needed to make the
        // db recoverable. Sync'ing the environment doesn't work.
        if (this.pendingUris != null) {
            this.pendingUris.sync();
        }
        CheckpointUtils.writeObjectToFile(this.alreadyIncluded, checkpointDir);
        logger.fine("Finished serializing already seen as part "
            + "of checkpoint.");
        // Serialize ourselves.
        CheckpointUtils.writeObjectToFile(this, checkpointDir);
    }

    /**
     * Dump all still-enqueued URIs to the crawl.log -- without actually
     * dequeuing. Useful for understanding what was remaining in a
     * crawl that was ended early, for example at a time limit.
     *
     * @throws DatabaseException
     */
    public void dumpAllPendingToLog() throws DatabaseException {
        Closure tolog = new Closure() {
            public void execute(Object curi) {
                log((CrawlURI)curi);
            }};
        pendingUris.forAllPendingDo(tolog);
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -