📄 crawluri.java
字号:
* Increment the number of attempts at getting the document referenced by * this URI. * * @return the number of attempts at getting the document referenced by this * URI. */ public int incrementFetchAttempts() { // TODO: rename, this is actually processing-loop-attempts return fetchAttempts++; } /** * Reset fetchAttempts counter. */ public void resetFetchAttempts() { this.fetchAttempts = 0; } /** * Reset deferrals counter. */ public void resetDeferrals() { this.deferrals = 0; } /** * Get the next processor to process this URI. * * @return the processor that should process this URI next. */ public Processor nextProcessor() { return nextProcessor; } /** * Get the processor chain that should be processing this URI after the * current chain is finished with it. * * @return the next processor chain to process this URI. */ public ProcessorChain nextProcessorChain() { return nextProcessorChain; } /** * Set the next processor to process this URI. * * @param processor the next processor to process this URI. */ public void setNextProcessor(Processor processor) { nextProcessor = processor; } /** * Set the next processor chain to process this URI. * * @param nextProcessorChain the next processor chain to process this URI. */ public void setNextProcessorChain(ProcessorChain nextProcessorChain) { this.nextProcessorChain = nextProcessorChain; } /** * Do all actions associated with setting a <code>CrawlURI</code> as * requiring a prerequisite. * * @param lastProcessorChain Last processor chain reference. This chain is * where this <code>CrawlURI</code> goes next. * @param preq Object to set a prerequisite. * @throws URIException */ public void markPrerequisite(String preq, ProcessorChain lastProcessorChain) throws URIException { Link link = createLink(preq,Link.PREREQ_MISC,Link.PREREQ_HOP); setPrerequisiteUri(link); incrementDeferrals(); setFetchStatus(S_DEFERRED); skipToProcessorChain(lastProcessorChain); } /** * Set a prerequisite for this URI. * <p> * A prerequisite is a URI that must be crawled before this URI can be * crawled. * * @param link Link to set as prereq. */ public void setPrerequisiteUri(Object link) { putObject(A_PREREQUISITE_URI, link); } /** * Get the prerequisite for this URI. * <p> * A prerequisite is a URI that must be crawled before this URI can be * crawled. * * @return the prerequisite for this URI or null if no prerequisite. */ public Object getPrerequisiteUri() { return getObject(A_PREREQUISITE_URI); } /** * @return True if this CrawlURI has a prerequisite. */ public boolean hasPrerequisiteUri() { return containsKey(A_PREREQUISITE_URI); } /** * Returns true if this CrawlURI is a prerequisite. * * @return true if this CrawlURI is a prerequisite. */ public boolean isPrerequisite() { return this.prerequisite; } /** * Set if this CrawlURI is itself a prerequisite URI. * * @param prerequisite True if this CrawlURI is itself a prerequiste uri. */ public void setPrerequisite(boolean prerequisite) { this.prerequisite = prerequisite; } /** * @return This crawl URI as a string wrapped with 'CrawlURI(' + * ')'. */ public String getCrawlURIString() { if (this.cachedCrawlURIString == null) { synchronized (this) { if (this.cachedCrawlURIString == null) { this.cachedCrawlURIString = "CrawlURI(" + toString() + ")"; } } } return this.cachedCrawlURIString; } /** * Get the content type of this URI. * * @return Fetched URIs content type. May be null. */ public String getContentType() { return this.contentType; } /** * Set a fetched uri's content type. * * @param ct Contenttype. May be null. */ public void setContentType(String ct) { this.contentType = ct; } /** * Set the number of the ToeThread responsible for processing this uri. * * @param i the ToeThread number. */ public void setThreadNumber(int i) { threadNumber = i; } /** * Get the number of the ToeThread responsible for processing this uri. * * @return the ToeThread number. */ public int getThreadNumber() { return threadNumber; } /** * Increment the deferral count. * */ public void incrementDeferrals() { deferrals++; } /** * Get the deferral count. * * @return the deferral count. */ public int getDeferrals() { return deferrals; } /** * Remove all attributes set on this uri. * <p> * This methods removes the attribute list. */ public void stripToMinimal() { clearAList(); } /** * Get the size in bytes of this URI's recorded content, inclusive * of things like protocol headers. It is the responsibility of the * classes which fetch the URI to set this value accordingly -- it is * not calculated/verified within CrawlURI. * * This value is consulted in reporting/logging/writing-decisions. * * @see #setContentSize() * @return contentSize */ public long getContentSize(){ return contentSize; } /** * Make note of a non-fatal error, local to a particular Processor, * which should be logged somewhere, but allows processing to continue. * * This is how you add to the local-error log (the 'localized' in * the below is making an error local rather than global, not * making a swiss-french version of the error.). * * @param processorName Name of processor the exception was thrown * in. * @param ex Throwable to log. * @param message Extra message to log beyond exception message. */ public void addLocalizedError(final String processorName, final Throwable ex, final String message) { List<LocalizedError> localizedErrors; if (containsKey(A_LOCALIZED_ERRORS)) { @SuppressWarnings("unchecked") List<LocalizedError> temp // to prevent warning on cast = (List<LocalizedError>) getObject(A_LOCALIZED_ERRORS); localizedErrors = temp; } else { localizedErrors = new ArrayList<LocalizedError>(); putObject(A_LOCALIZED_ERRORS, localizedErrors); } localizedErrors.add(new LocalizedError(processorName, ex, message)); addAnnotation("le:" + getClassSimpleName(ex.getClass()) + "@" + processorName); } // TODO: Move to utils. protected String getClassSimpleName(final Class c) { String classname = c.getName(); int index = classname.lastIndexOf('.'); return ((index > 0 && (index + 1) < classname.length())? classname.substring(index + 1): classname); } /** * Add an annotation: an abbrieviated indication of something special * about this URI that need not be present in every crawl.log line, * but should be noted for future reference. * * @param annotation the annotation to add; should not contain * whitespace or a comma */ public void addAnnotation(String annotation) { String annotations; if(containsKey(A_ANNOTATIONS)) { annotations = getString(A_ANNOTATIONS); annotations += ","+annotation; } else { annotations = annotation; } putString(A_ANNOTATIONS,annotations); } /** * TODO: Implement truncation using booleans rather than as this * ugly String parse. * @return True if fetch was truncated. */ public boolean isTruncatedFetch() { return annotationContains(TRUNC_SUFFIX); } public boolean isLengthTruncatedFetch() { return annotationContains(LENGTH_TRUNC); } public boolean isTimeTruncatedFetch() { return annotationContains(TIMER_TRUNC); } public boolean isHeaderTruncatedFetch() { return annotationContains(HEADER_TRUNC); } protected boolean annotationContains(final String str2Find) { boolean result = false; if (!containsKey(A_ANNOTATIONS)) { return result; } String annotations = getString(A_ANNOTATIONS); if (annotations != null && annotations.length() > 0) { result = annotations.indexOf(str2Find) >= 0; } return result; } /** * Get the annotations set for this uri. * * @return the annotations set for this uri. */ public String getAnnotations() { return (containsKey(A_ANNOTATIONS))? getString(A_ANNOTATIONS): null; } /** * Get the embeded hop count. * * @return the embeded hop count. * @deprecated */ public int getEmbedHopCount() { return embedHopCount; } /** * Get the link hop count. * * @return the link hop count. * @deprecated */ public int getLinkHopCount() { return linkHopCount; } /** * Mark this uri as being a seed. * * * @deprecated */ public void markAsSeed() { linkHopCount = 0; embedHopCount = 0; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -