📄 adaptiverevisitfrontier.java
字号:
curi.getUURI().toString(), array); } // once logged, discard curi.remove(A_LOCALIZED_ERRORS); } } /** * The CrawlURI has been successfully crawled. * * @param curi The CrawlURI */ protected void successDisposition(CrawlURI curi) { curi.aboutToLog(); long waitInterval = 0; if(curi.containsKey(A_WAIT_INTERVAL)){ waitInterval = curi.getLong(A_WAIT_INTERVAL); curi.addAnnotation("wt:" + ArchiveUtils.formatMillisecondsToConventional( waitInterval)); } else { logger.severe("Missing wait interval for " + curi.toString() + " WaitEvaluator may be missing."); } if(curi.containsKey(A_NUMBER_OF_VISITS)){ curi.addAnnotation(curi.getInt(A_NUMBER_OF_VISITS) + "vis"); } if(curi.containsKey(A_NUMBER_OF_VERSIONS)){ curi.addAnnotation(curi.getInt(A_NUMBER_OF_VERSIONS) + "ver"); } if(curi.containsKey(A_FETCH_OVERDUE)){ curi.addAnnotation("ov:" + ArchiveUtils.formatMillisecondsToConventional( (curi.getLong(A_FETCH_OVERDUE)))); } Object array[] = { curi }; controller.uriProcessing.log( Level.INFO, curi.getUURI().toString(), array); succeededFetchCount++; totalProcessedBytes += curi.getContentSize(); // Let everyone know in case they want to do something before we strip // the curi. controller.fireCrawledURISuccessfulEvent(curi); curi.setSchedulingDirective(CandidateURI.NORMAL); // Set time of next processing curi.putLong(A_TIME_OF_NEXT_PROCESSING, System.currentTimeMillis()+waitInterval); /* Update HQ */ AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey()); // Wake up time is based on the time when a fetch was completed + the // calculated snooze time for politeness. If the fetch completion time // is missing, we'll use current time. long wakeupTime = (curi.containsKey(A_FETCH_COMPLETED_TIME)? curi.getLong(A_FETCH_COMPLETED_TIME): (new Date()).getTime()) + calculateSnoozeTime(curi); // Ready the URI for reserialization. curi.processingCleanup(); curi.resetDeferrals(); curi.resetFetchAttempts(); try { hq.update(curi, true, wakeupTime); } catch (IOException e) { logger.severe("An IOException occured when updating " + curi.toString() + "\n" + e.getMessage()); e.printStackTrace(); } } /** * Put near top of relevant hostQueue (but behind anything recently * scheduled 'high').. * * @param curi CrawlURI to reschedule. Its time of next processing is not * modified. * @param errorWait signals if there should be a wait before retrying. * @throws AttributeNotFoundException */ protected void reschedule(CrawlURI curi, boolean errorWait) throws AttributeNotFoundException { long delay = 0; if(errorWait){ if(curi.containsKey(A_RETRY_DELAY)) { delay = curi.getLong(A_RETRY_DELAY); } else { // use ARFrontier default delay = ((Long)getAttribute(ATTR_RETRY_DELAY,curi)).longValue(); } } long retryTime = (curi.containsKey(A_FETCH_COMPLETED_TIME)? curi.getLong(A_FETCH_COMPLETED_TIME): (new Date()).getTime()) + delay; AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey()); // Ready the URI for reserialization. curi.processingCleanup(); if(errorWait){ curi.resetDeferrals(); //Defferals only refer to immediate retries. } try { hq.update(curi, errorWait, retryTime); } catch (IOException e) { // TODO Handle IOException e.printStackTrace(); } } /** * The CrawlURI has encountered a problem, and will not * be retried. * * @param curi The CrawlURI */ protected void failureDisposition(CrawlURI curi) { //Let interested listeners know of failed disposition. this.controller.fireCrawledURIFailureEvent(curi); // send to basic log curi.aboutToLog(); Object array[] = { curi }; this.controller.uriProcessing.log( Level.INFO, curi.getUURI().toString(), array); // if exception, also send to crawlErrors if (curi.getFetchStatus() == S_RUNTIME_EXCEPTION) { this.controller.runtimeErrors.log( Level.WARNING, curi.getUURI().toString(), array); } failedFetchCount++; // Put the failed URI at the very back of the queue. curi.setSchedulingDirective(CandidateURI.NORMAL); // TODO: reconsider this curi.putLong(A_TIME_OF_NEXT_PROCESSING,Long.MAX_VALUE); AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey()); // Ready the URI for serialization. curi.processingCleanup(); curi.resetDeferrals(); curi.resetFetchAttempts(); try { // No wait on failure. No contact was made with the server. boolean shouldForget = shouldBeForgotten(curi); if(shouldForget && alreadyIncluded != null){ alreadyIncluded.forget(canonicalize(curi.getUURI()),curi); } hq.update(curi,false, 0, shouldForget); } catch (IOException e) { // TODO Handle IOException e.printStackTrace(); } } protected void disregardDisposition(CrawlURI curi) { //Let interested listeners know of disregard disposition. controller.fireCrawledURIDisregardEvent(curi); // send to basic log curi.aboutToLog(); Object array[] = { curi }; controller.uriProcessing.log( Level.INFO, curi.getUURI().toString(), array); disregardedUriCount++; // Todo: consider timout before retrying disregarded elements. // Possibly add a setting to the WaitEvaluators? curi.putLong(A_TIME_OF_NEXT_PROCESSING,Long.MAX_VALUE); curi.setSchedulingDirective(CandidateURI.NORMAL); AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey()); // Ready the URI for reserialization. curi.processingCleanup(); curi.resetDeferrals(); curi.resetFetchAttempts(); try { // No politness wait on disregard. No contact was made with server hq.update(curi, false, 0, shouldBeForgotten(curi)); } catch (IOException e) { // TODO Handle IOException e.printStackTrace(); } } /** * Some URIs, if they recur, deserve another * chance at consideration: they might not be too * many hops away via another path, or the scope * may have been updated to allow them passage. * * @param curi * @return True if curi should be forgotten. */ protected boolean shouldBeForgotten(CrawlURI curi) { switch(curi.getFetchStatus()) { case S_OUT_OF_SCOPE: case S_TOO_MANY_EMBED_HOPS: case S_TOO_MANY_LINK_HOPS: return true; default: return false; } } /** * Checks if a recently completed CrawlURI that did not finish successfully * needs to be retried immediately (processed again as soon as politeness * allows.) * * @param curi The CrawlURI to check * @return True if we need to retry promptly. * @throws AttributeNotFoundException If problems occur trying to read the * maximum number of retries from the settings framework. */ protected boolean needsPromptRetry(CrawlURI curi) throws AttributeNotFoundException { if (curi.getFetchAttempts() >= ((Integer)getAttribute(ATTR_MAX_RETRIES, curi)).intValue() ) { return false; } switch (curi.getFetchStatus()) { case S_DEFERRED: return true; case HttpStatus.SC_UNAUTHORIZED: // We can get here though usually a positive status code is // a success. We get here if there is rfc2617 credential data // loaded and we're supposed to go around again. See if any // rfc2617 credential present and if there, assume it got // loaded in FetchHTTP on expectation that we're to go around // again. If no rfc2617 loaded, we should not be here. boolean loaded = curi.hasRfc2617CredentialAvatar(); if (!loaded) { logger.severe("Have 401 but no creds loaded " + curi); } return loaded; default: return false; } } /** * Checks if a recently completed CrawlURI that did not finish successfully * needs to be retried (processed again after some time elapses) * * @param curi The CrawlURI to check * @return True if we need to retry. * @throws AttributeNotFoundException If problems occur trying to read the * maximum number of retries from the settings framework. */ protected boolean needsRetrying(CrawlURI curi) throws AttributeNotFoundException { // Check to see if maximum number of retries has been exceeded. if (curi.getFetchAttempts() >= ((Integer)getAttribute(ATTR_MAX_RETRIES,curi)).intValue() ) { return false; } else { // Check if FetchStatus indicates that a delayed retry is needed. switch (curi.getFetchStatus()) { case S_CONNECT_FAILED: case S_CONNECT_LOST: case S_DOMAIN_UNRESOLVABLE: // these are all worth a retry // TODO: consider if any others (S_TIMEOUT in some cases?) // deserve retry return true; default: return false; } } } protected boolean isDisregarded(CrawlURI curi) { switch (curi.getFetchStatus()) { case S_ROBOTS_PRECLUDED : // they don't want us to have it case S_OUT_OF_SCOPE : // filtered out by scope case S_BLOCKED_BY_CUSTOM_PROCESSOR: case S_BLOCKED_BY_USER : // filtered out by user case S_TOO_MANY_EMBED_HOPS : // too far from last true link case S_TOO_MANY_LINK_HOPS : // too far from seeds case S_DELETED_BY_USER : // user deleted return true; default: return false; } } /** * Calculates how long a host queue needs to be snoozed following the * crawling of a URI. * * @param curi The CrawlURI * @return How long to snooze. */ protected long calculateSnoozeTime(CrawlURI curi) { long durationToWait = 0; if (curi.containsKey(A_FETCH_BEGAN_TIME) && curi.containsKey(A_FETCH_COMPLETED_TIME)) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -