📄 CrawlURI.java
字号:
public boolean isPrerequisite() {
    return this.prerequisite;
}

/**
 * Set if this CrawlURI is itself a prerequisite URI.
 *
 * @param prerequisite True if this CrawlURI is itself a prerequisite URI.
 */
public void setPrerequisite(boolean prerequisite) {
    this.prerequisite = prerequisite;
}

/**
 * Lazily build and cache the "CrawlURI(...)" rendering of this URI.
 *
 * NOTE(review): this is double-checked locking on a cache field whose
 * declaration is not visible here; if the field is not volatile the
 * race is benign only because String is immutable -- confirm the
 * declaration.
 *
 * @return This crawl URI as a string wrapped with 'CrawlURI(' + ')'.
 */
public String getCrawlURIString() {
    if (this.cachedCrawlURIString == null) {
        synchronized (this) {
            if (this.cachedCrawlURIString == null) {
                this.cachedCrawlURIString = "CrawlURI(" + toString() + ")";
            }
        }
    }
    return this.cachedCrawlURIString;
}

/**
 * Get the content type of this URI.
 *
 * @return Fetched URI's content type. May be null.
 */
public String getContentType() {
    return this.contentType;
}

/**
 * Set a fetched URI's content type.
 *
 * @param ct Content type. May be null.
 */
public void setContentType(String ct) {
    this.contentType = ct;
}

/**
 * Set the number of the ToeThread responsible for processing this URI.
 *
 * @param i the ToeThread number.
 */
public void setThreadNumber(int i) {
    threadNumber = i;
}

/**
 * Get the number of the ToeThread responsible for processing this URI.
 *
 * @return the ToeThread number.
 */
public int getThreadNumber() {
    return threadNumber;
}

/** Increment the deferral count. */
public void incrementDeferrals() {
    deferrals++;
}

/**
 * Get the deferral count.
 *
 * @return the deferral count.
 */
public int getDeferrals() {
    return deferrals;
}

/**
 * Remove all attributes set on this URI.
 * <p>
 * This method removes the attribute list.
 */
public void stripToMinimal() {
    clearAList();
}

/**
 * Get the size in bytes of this URI's recorded content, inclusive
 * of things like protocol headers. It is the responsibility of the
 * classes which fetch the URI to set this value accordingly -- it is
 * not calculated/verified within CrawlURI.
 *
 * This value is consulted in reporting/logging/writing-decisions.
* * @see #setContentSize() * @return contentSize */ public long getContentSize(){ return contentSize; } /** * Make note of a non-fatal error, local to a particular Processor, * which should be logged somewhere, but allows processing to continue. * * This is how you add to the local-error log (the 'localized' in * the below is making an error local rather than global, not * making a swiss-french version of the error.). * * @param processorName Name of processor the exception was thrown * in. * @param ex Throwable to log. * @param message Extra message to log beyond exception message. */ public void addLocalizedError(final String processorName, final Throwable ex, final String message) { List<LocalizedError> localizedErrors; if (containsKey(A_LOCALIZED_ERRORS)) { @SuppressWarnings("unchecked") List<LocalizedError> temp // to prevent warning on cast = (List<LocalizedError>) getObject(A_LOCALIZED_ERRORS); localizedErrors = temp; } else { localizedErrors = new ArrayList<LocalizedError>(); putObject(A_LOCALIZED_ERRORS, localizedErrors); } localizedErrors.add(new LocalizedError(processorName, ex, message)); addAnnotation("le:" + getClassSimpleName(ex.getClass()) + "@" + processorName); } // TODO: Move to utils. protected String getClassSimpleName(final Class c) { String classname = c.getName(); int index = classname.lastIndexOf('.'); return ((index > 0 && (index + 1) < classname.length())? classname.substring(index + 1): classname); } /** * Add an annotation: an abbrieviated indication of something special * about this URI that need not be present in every crawl.log line, * but should be noted for future reference. 
* * @param annotation the annotation to add; should not contain * whitespace or a comma */ public void addAnnotation(String annotation) { String annotations; if(containsKey(A_ANNOTATIONS)) { annotations = getString(A_ANNOTATIONS); annotations += ","+annotation; } else { annotations = annotation; } putString(A_ANNOTATIONS,annotations); } /** * TODO: Implement truncation using booleans rather than as this * ugly String parse. * @return True if fetch was truncated. */ public boolean isTruncatedFetch() { return annotationContains(TRUNC_SUFFIX); } public boolean isLengthTruncatedFetch() { return annotationContains(LENGTH_TRUNC); } public boolean isTimeTruncatedFetch() { return annotationContains(TIMER_TRUNC); } public boolean isHeaderTruncatedFetch() { return annotationContains(HEADER_TRUNC); } protected boolean annotationContains(final String str2Find) { boolean result = false; if (!containsKey(A_ANNOTATIONS)) { return result; } String annotations = getString(A_ANNOTATIONS); if (annotations != null && annotations.length() > 0) { result = annotations.indexOf(str2Find) >= 0; } return result; } /** * Get the annotations set for this uri. * * @return the annotations set for this uri. */ public String getAnnotations() { return (containsKey(A_ANNOTATIONS))? getString(A_ANNOTATIONS): null; } /** * Get the embeded hop count. * * @return the embeded hop count. * @deprecated */ public int getEmbedHopCount() { return embedHopCount; } /** * Get the link hop count. * * @return the link hop count. * @deprecated */ public int getLinkHopCount() { return linkHopCount; } /** * Mark this uri as being a seed. * * * @deprecated */ public void markAsSeed() { linkHopCount = 0; embedHopCount = 0; } /** * Get the user agent to use for crawling this URI. * * If null the global setting should be used. * * @return user agent or null */ public String getUserAgent() { return userAgent; } /** * Set the user agent to use when crawling this URI. * * If not set the global settings should be used. 
* * @param string user agent to use */ public void setUserAgent(String string) { userAgent = string; } /** * Set which processor should be the next processor to process this uri * instead of using the default next processor. * * @param processorChain the processor chain to skip to. * @param processor the processor in the processor chain to skip to. */ public void skipToProcessor(ProcessorChain processorChain, Processor processor) { setNextProcessorChain(processorChain); setNextProcessor(processor); } /** * Set which processor chain should be processing this uri next. * * @param processorChain the processor chain to skip to. */ public void skipToProcessorChain(ProcessorChain processorChain) { setNextProcessorChain(processorChain); setNextProcessor(null); } /** * For completed HTTP transactions, the length of the content-body. * * @return For completed HTTP transactions, the length of the content-body. */ public long getContentLength() { if (this.contentLength < 0) { this.contentLength = (getHttpRecorder() != null)? getHttpRecorder().getResponseContentLength(): 0; } return this.contentLength; } /** * Get size of data recorded (transferred) * * @return recorded data size */ public long getRecordedSize() { return (getHttpRecorder() != null) ? getHttpRecorder().getRecordedInput().getSize() // if unavailable fall back on content-size : getContentSize(); } /** * Sets the 'content size' for the URI, which is considered inclusive * of all recorded material (such as protocol headers) or even material * 'virtually' considered (as in material from a previous fetch * confirmed unchanged with a server). (In contrast, content-length * matches the HTTP definition, that of the enclosed content-body.) * * Should be set by a fetcher or other processor as soon as the final * size of recorded content is known. Setting to an artificial/incorrect * value may affect other reporting/processing. * * @param l Content size. 
*/ public void setContentSize(long l) { contentSize = l; } /** * If true then a link extractor has already claimed this CrawlURI and * performed link extraction on the document content. This does not * preclude other link extractors that may have an interest in this * CrawlURI from also doing link extraction but default behavior should * be to not run if link extraction has already been done. * * <p>There is an onus on link extractors to set this flag if they have * run. * * <p>The only extractor of the default Heritrix set that does not * respect this flag is * {@link org.archive.crawler.extractor.ExtractorHTTP}. * It runs against HTTP headers, not the document content. * * @return True if a processor has performed link extraction on this * CrawlURI * * @see #linkExtractorFinished() */ public boolean hasBeenLinkExtracted(){ return linkExtractorFinished; } /** * Note that link extraction has been performed on this CrawlURI. A processor * doing link extraction should invoke this method once it has finished it's * work. It should invoke it even if no links are extracted. It should only * invoke this method if the link extraction was performed on the document * body (not the HTTP headers etc.). * * @see #hasBeenLinkExtracted() */ public void linkExtractorFinished() { linkExtractorFinished = true; if(discardedOutlinks>0) { addAnnotation("dol:"+discardedOutlinks); } } /** * Notify CrawlURI it is about to be logged; opportunity * for self-annotation */ public void aboutToLog() { if (fetchAttempts>1) { addAnnotation(fetchAttempts+"t"); } } /** * Get the http recorder associated with this uri. * * @return Returns the httpRecorder. May be null but its set early in * FetchHttp so there is an issue if its null. */ public HttpRecorder getHttpRecorder() { return httpRecorder; } /** * Set the http recorder to be associated with this uri. * * @param httpRecorder The httpRecorder to set. 
*/ public void setHttpRecorder(HttpRecorder httpRecorder) { this.httpRecorder = httpRecorder; } /** * Return true if this is a http transaction. * * TODO: Compound this and {@link #isPost()} method so that there is one * place to go to find out if get http, post http, ftp, dns. * * @return True if this is a http transaction. */ public boolean isHttpTransaction() { return containsKey(A_HTTP_TRANSACTION); } /** * Clean up after a run through the processing chain. * * Called on the end of processing chain by Frontier#finish. Null out any * state gathered during processing. */ public void processingCleanup() { this.httpRecorder = null; this.fetchStatus = S_UNATTEMPTED; this.setPrerequisite(false); this.contentSize = UNCALCULATED; this.contentLength = UNCALCULATED; // Clear 'links extracted' flag. this.linkExtractorFinished = false; // Clean the alist of all but registered permanent members. setAList(getPersistentAList()); } public AList getPersistentAList() { AList newAList = new HashtableAList(); // copy declared persistent keys if(alistPersistentMember!=null && alistPersistentMember.size() > 0) { newAList.copyKeysFrom(alistPersistentMember.iterator(), getAList()); } // also copy declared 'heritable' keys List heritableKeys = (List) getObject(A_HERITABLE_KEYS); if(heritableKeys!=null) { newAList.copyKeysFrom(heritableKeys.iterator(), getAList()); } return newAList; } /** * Make a <code>CrawlURI</code> from the passed <code>CandidateURI</code>. * * Its safe to pass a CrawlURI instance. In this case we just return it * as a result. Otherwise, we create new CrawlURI instance. * * @param caUri Candidate URI. * @param ordinal * @return A crawlURI made from the passed CandidateURI. */ public static CrawlURI from(CandidateURI caUri, long ordinal) { return (caUri instanceof CrawlURI)? (CrawlURI)caUri: new CrawlURI(caUri, ordinal); } /** * @param avatars Credential avatars to save off. 
*/ private void setCredentialAvatars(Set avatars) { putObject(A_CREDENTIAL_AVATARS_KEY, avatars); } /** * @return Credential avatars. Null if none set. */ @SuppressWarnings("unchecked") public Set<CredentialAvatar> getCredentialAvatars() { return (Set)getObject(A_CREDENTIAL_AVATARS_KEY); } /**
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -