📄 AbstractFrontier.java
     * @param curi
     *            The CrawlURI
     * @return millisecond politeness delay
     */
    protected long politenessDelayFor(CrawlURI curi) {
        long durationToWait = 0;
        if (curi.containsKey(A_FETCH_BEGAN_TIME)
                && curi.containsKey(A_FETCH_COMPLETED_TIME)) {
            long completeTime = curi.getLong(A_FETCH_COMPLETED_TIME);
            long durationTaken =
                (completeTime - curi.getLong(A_FETCH_BEGAN_TIME));
            durationToWait = (long)(((Float)getUncheckedAttribute(curi,
                    ATTR_DELAY_FACTOR)).floatValue() * durationTaken);

            long minDelay = ((Integer)getUncheckedAttribute(curi,
                    ATTR_MIN_DELAY)).longValue();
            if (minDelay > durationToWait) {
                // wait at least the minimum
                durationToWait = minDelay;
            }

            long maxDelay = ((Integer)getUncheckedAttribute(curi,
                    ATTR_MAX_DELAY)).longValue();
            if (durationToWait > maxDelay) {
                // wait no more than the maximum
                durationToWait = maxDelay;
            }

            long now = System.currentTimeMillis();
            int maxBandwidthKB = ((Integer)getUncheckedAttribute(curi,
                    ATTR_MAX_HOST_BANDWIDTH_USAGE)).intValue();
            if (maxBandwidthKB > 0) {
                // Enforce bandwidth limit
                CrawlHost host = controller.getServerCache().getHostFor(curi);
                long minDurationToWait =
                    host.getEarliestNextURIEmitTime() - now;
                float maxBandwidth = maxBandwidthKB * 1.024F; // kilo factor
                long processedBytes = curi.getContentSize();
                host.setEarliestNextURIEmitTime(
                        (long)(processedBytes / maxBandwidth) + now);
                if (minDurationToWait > durationToWait) {
                    durationToWait = minDurationToWait;
                }
            }
        }
        return durationToWait;
    }

    /**
     * Ensure that any overall-bandwidth-usage limit is respected, by pausing
     * as long as necessary.
     *
     * @param now
     * @throws InterruptedException
     */
    private void enforceBandwidthThrottle(long now)
            throws InterruptedException {
        int maxBandwidthKB = ((Integer)getUncheckedAttribute(null,
                ATTR_MAX_OVERALL_BANDWIDTH_USAGE)).intValue();
        if (maxBandwidthKB > 0) {
            // Make sure that new bandwidth setting doesn't affect total crawl
            if (maxBandwidthKB != lastMaxBandwidthKB) {
                lastMaxBandwidthKB = maxBandwidthKB;
                processedBytesAfterLastEmittedURI = totalProcessedBytes;
            }

            // Enforce bandwidth limit
            long sleepTime = nextURIEmitTime - now;
            float maxBandwidth = maxBandwidthKB * 1.024F; // Kilo_factor
            long processedBytes =
                totalProcessedBytes - processedBytesAfterLastEmittedURI;
            long shouldHaveEmittedDiff =
                nextURIEmitTime == 0 ? 0 : nextURIEmitTime - now;
            nextURIEmitTime = (long)(processedBytes / maxBandwidth)
                    + now + shouldHaveEmittedDiff;
            processedBytesAfterLastEmittedURI = totalProcessedBytes;
            if (sleepTime > 0) {
                long targetTime = now + sleepTime;
                now = System.currentTimeMillis();
                while (now < targetTime) {
                    synchronized (this) {
                        if (logger.isLoggable(Level.FINE)) {
                            logger.fine("Frontier waits for: " + sleepTime
                                    + "ms to respect bandwidth limit.");
                        }
                        // TODO: now that this is a wait(), frontier can
                        // still schedule and finish items while waiting,
                        // which is good, but multiple threads could all
                        // wait for the same wakeTime, which somewhat
                        // spoils the throttle... should be fixed.
                        wait(targetTime - now);
                    }
                    now = System.currentTimeMillis();
                }
            }
        }
    }
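    // Worked example (illustrative only, with assumed settings): with an
    // overall cap of ATTR_MAX_OVERALL_BANDWIDTH_USAGE = 100 KB/s, the factor
    // 100 * 1.024 = 102.4 converts the cap into bytes per millisecond. If
    // 1,024,000 bytes have been processed since the last emitted URI, the
    // next-emit timestamp is advanced 1024000 / 102.4 = 10000 ms beyond
    // 'now' (plus any carried-over deficit); a later call that arrives
    // before that timestamp finds sleepTime > 0 and wait()s out the
    // difference before emitting again.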
    /**
     * Take note of any processor-local errors that have been entered into
     * the CrawlURI.
     *
     * @param curi
     */
    protected void logLocalizedErrors(CrawlURI curi) {
        if (curi.containsKey(A_LOCALIZED_ERRORS)) {
            List localErrors = (List)curi.getObject(A_LOCALIZED_ERRORS);
            Iterator iter = localErrors.iterator();
            while (iter.hasNext()) {
                Object array[] = {curi, iter.next()};
                controller.localErrors.log(Level.WARNING,
                        curi.getUURI().toString(), array);
            }
            // once logged, discard
            curi.remove(A_LOCALIZED_ERRORS);
        }
    }

    /**
     * Utility method to return a scratch dir for the given key's temp files.
     * Every key gets its own subdir. To avoid having any one directory with
     * thousands of files, there are also two levels of enclosing directory
     * named by the least-significant hex digits of the key string's java
     * hashcode.
     *
     * @param key
     * @return File representing scratch directory
     */
    protected File scratchDirFor(String key) {
        String hex = Integer.toHexString(key.hashCode());
        while (hex.length() < 4) {
            hex = "0" + hex;
        }
        int len = hex.length();
        return new File(this.controller.getStateDisk(),
                hex.substring(len - 2, len) + File.separator
                        + hex.substring(len - 4, len - 2) + File.separator
                        + key);
    }
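    // Worked example (illustrative only): for a hypothetical key whose
    // hashCode renders in hex as "...3f7c", the scratch directory resolves
    // to <state-disk>/7c/3f/<key> -- the last two hex digits name the outer
    // enclosing directory and the preceding two the inner one, spreading
    // keys across up to 256 x 256 directories.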
    protected boolean overMaxRetries(CrawlURI curi) {
        // never retry more than the max number of times
        if (curi.getFetchAttempts() >= ((Integer)getUncheckedAttribute(curi,
                ATTR_MAX_RETRIES)).intValue()) {
            return true;
        }
        return false;
    }

    public void importRecoverLog(String pathToLog, boolean retainFailures)
            throws IOException {
        File source = new File(pathToLog);
        if (!source.isAbsolute()) {
            source = new File(getSettingsHandler().getOrder().getController()
                    .getDisk(), pathToLog);
        }
        RecoveryJournal.importRecoverLog(source, this, retainFailures);
    }

    /*
     * (non-Javadoc)
     *
     * @see org.archive.crawler.framework.URIFrontier#kickUpdate()
     */
    public void kickUpdate() {
        // by default, do nothing
        // (scope will loadSeeds, if appropriate)
    }

    /**
     * Log to the main crawl.log
     *
     * @param curi
     */
    protected void log(CrawlURI curi) {
        curi.aboutToLog();
        Object array[] = {curi};
        this.controller.uriProcessing.log(Level.INFO,
                curi.getUURI().toString(), array);
    }

    protected boolean isDisregarded(CrawlURI curi) {
        switch (curi.getFetchStatus()) {
        case S_ROBOTS_PRECLUDED: // they don't want us to have it
        case S_BLOCKED_BY_CUSTOM_PROCESSOR:
        case S_OUT_OF_SCOPE: // filtered out by scope
        case S_BLOCKED_BY_USER: // filtered out by user
        case S_TOO_MANY_EMBED_HOPS: // too far from last true link
        case S_TOO_MANY_LINK_HOPS: // too far from seeds
        case S_DELETED_BY_USER: // user deleted
            return true;
        default:
            return false;
        }
    }

    /**
     * Checks if a recently completed CrawlURI that did not finish
     * successfully needs to be retried (processed again after some time
     * elapses).
     *
     * @param curi
     *            The CrawlURI to check
     * @return True if we need to retry.
     */
    protected boolean needsRetrying(CrawlURI curi) {
        if (overMaxRetries(curi)) {
            return false;
        }
        switch (curi.getFetchStatus()) {
        case HttpStatus.SC_UNAUTHORIZED:
            // We can get here though usually a positive status code is
            // a success. We get here if there is rfc2617 credential data
            // loaded and we're supposed to go around again. See if any
            // rfc2617 credential present and if there, assume it got
            // loaded in FetchHTTP on expectation that we're to go around
            // again. If no rfc2617 loaded, we should not be here.
            boolean loaded = curi.hasRfc2617CredentialAvatar();
            if (!loaded && logger.isLoggable(Level.INFO)) {
                logger.info("Have 401 but no creds loaded " + curi);
            }
            return loaded;
        case S_DEFERRED:
        case S_CONNECT_FAILED:
        case S_CONNECT_LOST:
        case S_DOMAIN_UNRESOLVABLE:
            // these are all worth a retry
            // TODO: consider if any others (S_TIMEOUT in some cases?)
            // deserve retry
            return true;
        default:
            return false;
        }
    }

    /**
     * Canonicalize passed uuri. It would be sweeter if this canonicalize
     * function was encapsulated by that which it canonicalizes, but because
     * settings change with context -- i.e. there may be overrides in
     * operation for a particular URI -- it's not so easy; each CandidateURI
     * would need a reference to the settings system. That's awkward to pass
     * in.
     *
     * @param uuri Candidate URI to canonicalize.
     * @return Canonicalized version of passed <code>uuri</code>.
     */
    protected String canonicalize(UURI uuri) {
        return Canonicalizer.canonicalize(uuri, this.controller.getOrder());
    }

    /**
     * Canonicalize passed CandidateURI. This method differs from
     * {@link #canonicalize(UURI)} in that it takes a look at the
     * CandidateURI context, possibly overriding any canonicalization effect
     * if it could make us miss content. If canonicalization produces an URL
     * that was 'alreadyseen', but the entry in the 'alreadyseen' database
     * did nothing but redirect to the current URL, we won't get the current
     * URL; we'll think we've already seen it. Examples would be archive.org
     * redirecting to www.archive.org or the inverse, www.netarkivet.net
     * redirecting to netarkivet.net (assuming the stripWWW rule is enabled).
     * <p>Note, this method under some circumstances sets the forceFetch
     * flag.
     *
     * @param cauri CandidateURI to examine.
     * @return Canonicalized <code>cauri</code>.
     */
    protected String canonicalize(CandidateURI cauri) {
        String canon = canonicalize(cauri.getUURI());
        if (cauri.isLocation()) {
            // If the via is not the same as where we're being redirected
            // (i.e. we're not being redirected back to the same page), AND
            // the canonicalization of the via equals that of the current
            // cauri, THEN forcefetch (so there is no chance of our not
            // crawling the content because the alreadyseen check thinks it
            // has seen the url before).
            // An example of an URL that redirects to itself is:
            // http://bridalelegance.com/images/buttons3/tuxedos-off.gif.
            // An example of an URL whose canonicalization equals its via's
            // canonicalization, and we want to fetch content at the
            // redirection (i.e. need to set forcefetch), is netarkivet.dk.
            if (!cauri.toString().equals(cauri.getVia().toString())
                    && canonicalize(cauri.getVia()).equals(canon)) {
                cauri.setForceFetch(true);
            }
        }
        return canon;
    }

    /**
     * @param cauri CrawlURI we're to get a key for.
     * @return a String token representing a queue
     */
    public String getClassKey(CandidateURI cauri) {
        String queueKey = (String)getUncheckedAttribute(cauri,
                ATTR_FORCE_QUEUE);
        if ("".equals(queueKey)) {
            // Typical case, barring overrides
            queueKey = queueAssignmentPolicy.getClassKey(this.controller,
                    cauri);
        }
        return queueKey;
    }
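    // Worked example (illustrative only, assuming a www-stripping
    // canonicalization rule is in effect): if http://netarkivet.dk/ redirects
    // to http://www.netarkivet.dk/, the redirect target canonicalizes to the
    // same string as its via, so without forceFetch it would be discarded as
    // already-seen and its content never fetched; setForceFetch(true) lets
    // it through despite the alreadyseen match.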
    /**
     * @return RecoveryJournal instance. May be null.
     */
    public FrontierJournal getFrontierJournal() {
        return this.recover;
    }

    public void crawlEnding(String sExitMessage) {
        // TODO Auto-generated method stub
    }

    public void crawlEnded(String sExitMessage) {
        if (logger.isLoggable(Level.INFO)) {
            logger.info("Closing with " + Long.toString(queuedUriCount())
                    + " urls still in queue.");
        }
    }

    public void crawlStarted(String message) {
        // TODO Auto-generated method stub
    }

    public void crawlPausing(String statusMessage) {
        // TODO Auto-generated method stub
    }

    public void crawlPaused(String statusMessage) {
        // TODO Auto-generated method stub
    }

    public void crawlResuming(String statusMessage) {
        // TODO Auto-generated method stub
    }

    public void crawlCheckpoint(File checkpointDir) throws Exception {
        if (this.recover == null) {
            return;
        }
        this.recover.checkpoint(checkpointDir);
    }

    //
    // Reporter implementation
    //

    public String singleLineReport() {
        return ArchiveUtils.singleLineReport(this);
    }

    public void reportTo(PrintWriter writer) {
        reportTo(null, writer);
    }
}
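For reference, the per-URI delay arithmetic above boils down to a scale-and-clamp plus an optional bandwidth floor. The standalone sketch below is not part of Heritrix; the class, method, and all settings values are hypothetical, and it only mirrors the computation performed by politenessDelayFor() for illustration.

// PolitenessDelaySketch.java -- illustrative only, not part of Heritrix.
public class PolitenessDelaySketch {

    /**
     * Mirrors the arithmetic of politenessDelayFor(): scale the observed
     * fetch duration by the delay factor, clamp the result into
     * [minDelayMs, maxDelayMs], then let a per-host bandwidth cap raise it
     * further if the response was large enough.
     */
    static long politenessDelay(long fetchDurationMs, float delayFactor,
            long minDelayMs, long maxDelayMs,
            int maxHostBandwidthKBps, long responseBytes) {
        long wait = (long)(delayFactor * fetchDurationMs);
        if (wait < minDelayMs) {
            wait = minDelayMs;        // wait at least the minimum
        }
        if (wait > maxDelayMs) {
            wait = maxDelayMs;        // wait no more than the maximum
        }
        if (maxHostBandwidthKBps > 0) {
            // KB/s * 1.024 == bytes per millisecond, the same conversion
            // factor used in the original code above.
            float bytesPerMs = maxHostBandwidthKBps * 1.024F;
            long bandwidthWait = (long)(responseBytes / bytesPerMs);
            if (bandwidthWait > wait) {
                wait = bandwidthWait; // the bandwidth floor dominates
            }
        }
        return wait;
    }

    public static void main(String[] args) {
        // 800 ms fetch, delay factor 5.0, clamp to [2000, 30000] ms, no cap:
        // prints 4000.
        System.out.println(politenessDelay(800, 5.0F, 2000, 30000, 0, 0));
        // Same fetch with a 5 KB/s per-host cap and a 51200-byte response:
        // 51200 / (5 * 1.024) = 10000 ms exceeds 4000 ms, so prints 10000.
        System.out.println(politenessDelay(800, 5.0F, 2000, 30000, 5, 51200));
    }
}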