📄 AbstractFrontier.java
     * @param curi
     *            The CrawlURI
     * @return millisecond politeness delay
     */
    protected long politenessDelayFor(CrawlURI curi) {
        long durationToWait = 0;
        if (curi.containsKey(A_FETCH_BEGAN_TIME)
                && curi.containsKey(A_FETCH_COMPLETED_TIME)) {
            long completeTime = curi.getLong(A_FETCH_COMPLETED_TIME);
            long durationTaken =
                (completeTime - curi.getLong(A_FETCH_BEGAN_TIME));
            durationToWait = (long)(((Float)getUncheckedAttribute(curi,
                    ATTR_DELAY_FACTOR)).floatValue() * durationTaken);

            long minDelay = ((Integer)getUncheckedAttribute(curi,
                    ATTR_MIN_DELAY)).longValue();
            if (minDelay > durationToWait) {
                // wait at least the minimum
                durationToWait = minDelay;
            }

            long maxDelay = ((Integer)getUncheckedAttribute(curi,
                    ATTR_MAX_DELAY)).longValue();
            if (durationToWait > maxDelay) {
                // wait no more than the maximum
                durationToWait = maxDelay;
            }

            long now = System.currentTimeMillis();
            int maxBandwidthKB = ((Integer)getUncheckedAttribute(curi,
                    ATTR_MAX_HOST_BANDWIDTH_USAGE)).intValue();
            if (maxBandwidthKB > 0) {
                // Enforce bandwidth limit
                CrawlHost host = controller.getServerCache().getHostFor(curi);
                long minDurationToWait =
                    host.getEarliestNextURIEmitTime() - now;
                float maxBandwidth = maxBandwidthKB * 1.024F; // kilo factor
                long processedBytes = curi.getContentSize();
                host.setEarliestNextURIEmitTime(
                        (long)(processedBytes / maxBandwidth) + now);
                if (minDurationToWait > durationToWait) {
                    durationToWait = minDurationToWait;
                }
            }
        }
        return durationToWait;
    }

    /**
     * Ensure that any overall-bandwidth-usage limit is respected, by pausing
     * as long as necessary.
     *
     * @param now
     * @throws InterruptedException
     */
    private void enforceBandwidthThrottle(long now)
            throws InterruptedException {
        int maxBandwidthKB = ((Integer)getUncheckedAttribute(null,
                ATTR_MAX_OVERALL_BANDWIDTH_USAGE)).intValue();
        if (maxBandwidthKB > 0) {
            // Make sure that new bandwidth setting doesn't affect total crawl
            if (maxBandwidthKB != lastMaxBandwidthKB) {
                lastMaxBandwidthKB = maxBandwidthKB;
                processedBytesAfterLastEmittedURI = totalProcessedBytes;
            }

            // Enforce bandwidth limit
            long sleepTime = nextURIEmitTime - now;
            float maxBandwidth = maxBandwidthKB * 1.024F; // Kilo_factor
            long processedBytes =
                totalProcessedBytes - processedBytesAfterLastEmittedURI;
            long shouldHaveEmittedDiff =
                nextURIEmitTime == 0 ? 0 : nextURIEmitTime - now;
            nextURIEmitTime = (long)(processedBytes / maxBandwidth)
                    + now + shouldHaveEmittedDiff;
            processedBytesAfterLastEmittedURI = totalProcessedBytes;
            if (sleepTime > 0) {
                long targetTime = now + sleepTime;
                now = System.currentTimeMillis();
                while (now < targetTime) {
                    synchronized (this) {
                        if (logger.isLoggable(Level.FINE)) {
                            logger.fine("Frontier waits for: " + sleepTime
                                    + "ms to respect bandwidth limit.");
                        }
                        // TODO: now that this is a wait(), frontier can
                        // still schedule and finish items while waiting,
                        // which is good, but multiple threads could all
                        // wait for the same wakeTime, which somewhat
                        // spoils the throttle... should be fixed.
                        wait(targetTime - now);
                    }
                    now = System.currentTimeMillis();
                }
            }
        }
    }
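    // Worked example (illustrative only, with assumed settings): with an
    // overall cap of ATTR_MAX_OVERALL_BANDWIDTH_USAGE = 100 KB/s, the factor
    // 100 * 1.024 = 102.4 converts the cap into bytes per millisecond. If
    // 1,024,000 bytes have been processed since the last emitted URI, the
    // next-emit timestamp is advanced 1024000 / 102.4 = 10000 ms beyond
    // 'now' (plus any carried-over deficit); a later call that arrives
    // before that timestamp finds sleepTime > 0 and wait()s out the
    // difference before emitting again.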
    /**
     * Take note of any processor-local errors that have been entered into
     * the CrawlURI.
     *
     * @param curi
     */
    protected void logLocalizedErrors(CrawlURI curi) {
        if (curi.containsKey(A_LOCALIZED_ERRORS)) {
            List localErrors = (List)curi.getObject(A_LOCALIZED_ERRORS);
            Iterator iter = localErrors.iterator();
            while (iter.hasNext()) {
                Object array[] = {curi, iter.next()};
                controller.localErrors.log(Level.WARNING,
                        curi.getUURI().toString(), array);
            }
            // once logged, discard
            curi.remove(A_LOCALIZED_ERRORS);
        }
    }

    /**
     * Utility method to return a scratch dir for the given key's temp files.
     * Every key gets its own subdir. To avoid having any one directory with
     * thousands of files, there are also two levels of enclosing directory
     * named by the least-significant hex digits of the key string's java
     * hashcode.
     *
     * @param key
     * @return File representing scratch directory
     */
    protected File scratchDirFor(String key) {
        String hex = Integer.toHexString(key.hashCode());
        while (hex.length() < 4) {
            hex = "0" + hex;
        }
        int len = hex.length();
        return new File(this.controller.getStateDisk(),
                hex.substring(len - 2, len) + File.separator
                        + hex.substring(len - 4, len - 2) + File.separator
                        + key);
    }
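    // Worked example (illustrative only): for a hypothetical key whose
    // hashCode renders in hex as "...3f7c", the scratch directory resolves
    // to <state-disk>/7c/3f/<key> -- the last two hex digits name the outer
    // enclosing directory and the preceding two the inner one, spreading
    // keys across up to 256 x 256 directories.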
    protected boolean overMaxRetries(CrawlURI curi) {
        // never retry more than the max number of times
        if (curi.getFetchAttempts() >= ((Integer)getUncheckedAttribute(curi,
                ATTR_MAX_RETRIES)).intValue()) {
            return true;
        }
        return false;
    }

    public void importRecoverLog(String pathToLog, boolean retainFailures)
            throws IOException {
        File source = new File(pathToLog);
        if (!source.isAbsolute()) {
            source = new File(getSettingsHandler().getOrder().getController()
                    .getDisk(), pathToLog);
        }
        RecoveryJournal.importRecoverLog(source, this, retainFailures);
    }

    /*
     * (non-Javadoc)
     *
     * @see org.archive.crawler.framework.URIFrontier#kickUpdate()
     */
    public void kickUpdate() {
        // by default, do nothing
        // (scope will loadSeeds, if appropriate)
    }

    /**
     * Log to the main crawl.log
     *
     * @param curi
     */
    protected void log(CrawlURI curi) {
        curi.aboutToLog();
        Object array[] = {curi};
        this.controller.uriProcessing.log(Level.INFO,
                curi.getUURI().toString(), array);
    }

    protected boolean isDisregarded(CrawlURI curi) {
        switch (curi.getFetchStatus()) {
        case S_ROBOTS_PRECLUDED: // they don't want us to have it
        case S_BLOCKED_BY_CUSTOM_PROCESSOR:
        case S_OUT_OF_SCOPE: // filtered out by scope
        case S_BLOCKED_BY_USER: // filtered out by user
        case S_TOO_MANY_EMBED_HOPS: // too far from last true link
        case S_TOO_MANY_LINK_HOPS: // too far from seeds
        case S_DELETED_BY_USER: // user deleted
            return true;
        default:
            return false;
        }
    }

    /**
     * Checks if a recently completed CrawlURI that did not finish
     * successfully needs to be retried (processed again after some time
     * elapses).
     *
     * @param curi
     *            The CrawlURI to check
     * @return True if we need to retry.
     */
    protected boolean needsRetrying(CrawlURI curi) {
        if (overMaxRetries(curi)) {
            return false;
        }
        switch (curi.getFetchStatus()) {
        case HttpStatus.SC_UNAUTHORIZED:
            // We can get here though usually a positive status code is
            // a success. We get here if there is rfc2617 credential data
            // loaded and we're supposed to go around again. See if any
            // rfc2617 credential present and if there, assume it got
            // loaded in FetchHTTP on expectation that we're to go around
            // again. If no rfc2617 loaded, we should not be here.
            boolean loaded = curi.hasRfc2617CredentialAvatar();
            if (!loaded && logger.isLoggable(Level.INFO)) {
                logger.info("Have 401 but no creds loaded " + curi);
            }
            return loaded;
        case S_DEFERRED:
        case S_CONNECT_FAILED:
        case S_CONNECT_LOST:
        case S_DOMAIN_UNRESOLVABLE:
            // these are all worth a retry
            // TODO: consider if any others (S_TIMEOUT in some cases?)
            // deserve retry
            return true;
        default:
            return false;
        }
    }

    /**
     * Canonicalize passed uuri. It would be sweeter if this canonicalize
     * function was encapsulated by that which it canonicalizes, but because
     * settings change with context -- i.e. there may be overrides in
     * operation for a particular URI -- it's not so easy; each CandidateURI
     * would need a reference to the settings system. That's awkward to pass
     * in.
     *
     * @param uuri Candidate URI to canonicalize.
     * @return Canonicalized version of passed <code>uuri</code>.
     */
    protected String canonicalize(UURI uuri) {
        return Canonicalizer.canonicalize(uuri, this.controller.getOrder());
    }

    /**
     * Canonicalize passed CandidateURI. This method differs from
     * {@link #canonicalize(UURI)} in that it takes a look at the
     * CandidateURI context, possibly overriding any canonicalization effect
     * if it could make us miss content. If canonicalization produces an URL
     * that was 'alreadyseen', but the entry in the 'alreadyseen' database
     * did nothing but redirect to the current URL, we won't get the current
     * URL; we'll think we've already seen it. Examples would be archive.org
     * redirecting to www.archive.org or the inverse, www.netarkivet.net
     * redirecting to netarkivet.net (assuming the stripWWW rule is enabled).
     * <p>Note, this method under some circumstances sets the forceFetch
     * flag.
     *
     * @param cauri CandidateURI to examine.
     * @return Canonicalized <code>cauri</code>.
     */
    protected String canonicalize(CandidateURI cauri) {
        String canon = canonicalize(cauri.getUURI());
        if (cauri.isLocation()) {
            // If the via is not the same as where we're being redirected
            // (i.e. we're not being redirected back to the same page), AND
            // the canonicalization of the via equals that of the current
            // cauri, THEN forcefetch (so there is no chance of our not
            // crawling the content because the alreadyseen check thinks it
            // has seen the url before).
            // An example of an URL that redirects to itself is:
            // http://bridalelegance.com/images/buttons3/tuxedos-off.gif.
            // An example of an URL whose canonicalization equals its via's
            // canonicalization, and we want to fetch content at the
            // redirection (i.e. need to set forcefetch), is netarkivet.dk.
            if (!cauri.toString().equals(cauri.getVia().toString())
                    && canonicalize(cauri.getVia()).equals(canon)) {
                cauri.setForceFetch(true);
            }
        }
        return canon;
    }

    /**
     * @param cauri CrawlURI we're to get a key for.
     * @return a String token representing a queue
     */
    public String getClassKey(CandidateURI cauri) {
        String queueKey = (String)getUncheckedAttribute(cauri,
                ATTR_FORCE_QUEUE);
        if ("".equals(queueKey)) {
            // Typical case, barring overrides
            queueKey = queueAssignmentPolicy.getClassKey(this.controller,
                    cauri);
        }
        return queueKey;
    }
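    // Worked example (illustrative only, assuming a www-stripping
    // canonicalization rule is in effect): if http://netarkivet.dk/ redirects
    // to http://www.netarkivet.dk/, the redirect target canonicalizes to the
    // same string as its via, so without forceFetch it would be discarded as
    // already-seen and its content never fetched; setForceFetch(true) lets
    // it through despite the alreadyseen match.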
    /**
     * @return RecoveryJournal instance. May be null.
     */
    public FrontierJournal getFrontierJournal() {
        return this.recover;
    }

    public void crawlEnding(String sExitMessage) {
        // TODO Auto-generated method stub
    }

    public void crawlEnded(String sExitMessage) {
        if (logger.isLoggable(Level.INFO)) {
            logger.info("Closing with " + Long.toString(queuedUriCount())
                    + " urls still in queue.");
        }
    }

    public void crawlStarted(String message) {
        // TODO Auto-generated method stub
    }

    public void crawlPausing(String statusMessage) {
        // TODO Auto-generated method stub
    }

    public void crawlPaused(String statusMessage) {
        // TODO Auto-generated method stub
    }

    public void crawlResuming(String statusMessage) {
        // TODO Auto-generated method stub
    }

    public void crawlCheckpoint(File checkpointDir) throws Exception {
        if (this.recover == null) {
            return;
        }
        this.recover.checkpoint(checkpointDir);
    }

    //
    // Reporter implementation
    //

    public String singleLineReport() {
        return ArchiveUtils.singleLineReport(this);
    }

    public void reportTo(PrintWriter writer) {
        reportTo(null, writer);
    }
}
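For reference, the per-URI delay arithmetic above boils down to a scale-and-clamp plus an optional bandwidth floor. The standalone sketch below is not part of Heritrix; the class, method, and all settings values are hypothetical, and it only mirrors the computation performed by politenessDelayFor() for illustration.

// PolitenessDelaySketch.java -- illustrative only, not part of Heritrix.
public class PolitenessDelaySketch {

    /**
     * Mirrors the arithmetic of politenessDelayFor(): scale the observed
     * fetch duration by the delay factor, clamp the result into
     * [minDelayMs, maxDelayMs], then let a per-host bandwidth cap raise it
     * further if the response was large enough.
     */
    static long politenessDelay(long fetchDurationMs, float delayFactor,
            long minDelayMs, long maxDelayMs,
            int maxHostBandwidthKBps, long responseBytes) {
        long wait = (long)(delayFactor * fetchDurationMs);
        if (wait < minDelayMs) {
            wait = minDelayMs;        // wait at least the minimum
        }
        if (wait > maxDelayMs) {
            wait = maxDelayMs;        // wait no more than the maximum
        }
        if (maxHostBandwidthKBps > 0) {
            // KB/s * 1.024 == bytes per millisecond, the same conversion
            // factor used in the original code above.
            float bytesPerMs = maxHostBandwidthKBps * 1.024F;
            long bandwidthWait = (long)(responseBytes / bytesPerMs);
            if (bandwidthWait > wait) {
                wait = bandwidthWait; // the bandwidth floor dominates
            }
        }
        return wait;
    }

    public static void main(String[] args) {
        // 800 ms fetch, delay factor 5.0, clamp to [2000, 30000] ms, no cap:
        // prints 4000.
        System.out.println(politenessDelay(800, 5.0F, 2000, 30000, 0, 0));
        // Same fetch with a 5 KB/s per-host cap and a 51200-byte response:
        // 51200 / (5 * 1.024) = 10000 ms exceeds 4000 ms, so prints 10000.
        System.out.println(politenessDelay(800, 5.0F, 2000, 30000, 5, 51200));
    }
}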