// CrawlURI.java (excerpt)
/**
 * Get the user agent to use for crawling this URI.
 *
 * If null, the global setting should be used.
 *
 * @return user agent or null
 */
public String getUserAgent() {
    return userAgent;
}

/**
 * Set the user agent to use when crawling this URI.
 *
 * If not set, the global settings should be used.
 *
 * @param string user agent to use
 */
public void setUserAgent(String string) {
    userAgent = string;
}

/**
 * Set which processor should be the next processor to process this URI,
 * instead of using the default next processor.
 *
 * @param processorChain the processor chain to skip to.
 * @param processor the processor in the processor chain to skip to.
 */
public void skipToProcessor(ProcessorChain processorChain,
        Processor processor) {
    setNextProcessorChain(processorChain);
    setNextProcessor(processor);
}

/**
 * Set which processor chain should be processing this URI next.
 *
 * @param processorChain the processor chain to skip to.
 */
public void skipToProcessorChain(ProcessorChain processorChain) {
    setNextProcessorChain(processorChain);
    setNextProcessor(null);
}

/**
 * For completed HTTP transactions, the length of the content-body.
 *
 * @return For completed HTTP transactions, the length of the content-body.
 */
public long getContentLength() {
    if (this.contentLength < 0) {
        this.contentLength = (getHttpRecorder() != null)?
            getHttpRecorder().getResponseContentLength(): 0;
    }
    return this.contentLength;
}

/**
 * Get the size of data recorded (transferred).
 *
 * @return recorded data size
 */
public long getRecordedSize() {
    return (getHttpRecorder() != null)?
        getHttpRecorder().getRecordedInput().getSize():
        // If unavailable, fall back on content-size.
        getContentSize();
}

/**
 * Sets the 'content size' for the URI, which is considered inclusive of
 * all recorded material (such as protocol headers) and even material
 * 'virtually' considered (as in material from a previous fetch confirmed
 * unchanged with a server). (In contrast, content-length matches the
 * HTTP definition: that of the enclosed content-body.)
 *
 * Should be set by a fetcher or other processor as soon as the final
 * size of recorded content is known. Setting an artificial/incorrect
 * value may affect other reporting/processing.
 *
 * @param l Content size.
 */
public void setContentSize(long l) {
    contentSize = l;
}

/**
 * If true, a link extractor has already claimed this CrawlURI and
 * performed link extraction on the document content. This does not
 * preclude other link extractors that may have an interest in this
 * CrawlURI from also doing link extraction, but the default behavior
 * should be to not run if link extraction has already been done.
 *
 * <p>There is an onus on link extractors to set this flag if they have
 * run.
 *
 * <p>The only extractor of the default Heritrix set that does not
 * respect this flag is
 * {@link org.archive.crawler.extractor.ExtractorHTTP}.
 * It runs against the HTTP headers, not the document content.
 *
 * @return True if a processor has performed link extraction on this
 * CrawlURI
 *
 * @see #linkExtractorFinished()
 */
public boolean hasBeenLinkExtracted() {
    return linkExtractorFinished;
}
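// Usage sketch (not from the original source): how a custom link
// extractor is expected to cooperate with the link-extraction flag
// above. The method name and the extraction step are hypothetical; the
// calls on 'curi' are the CrawlURI methods defined in this class.
void exampleExtract(CrawlURI curi) {
    if (curi.hasBeenLinkExtracted()) {
        // Default behavior: skip, another extractor has already claimed
        // this CrawlURI.
        return;
    }
    // ... parse the recorded document content and add outlinks ...
    // Set the flag even when no links were found, because the extraction
    // ran against the document body (not the HTTP headers).
    curi.linkExtractorFinished();
}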
/**
 * Note that link extraction has been performed on this CrawlURI. A
 * processor doing link extraction should invoke this method once it has
 * finished its work. It should invoke it even if no links are extracted.
 * It should only invoke this method if the link extraction was performed
 * on the document body (not the HTTP headers etc.).
 *
 * @see #hasBeenLinkExtracted()
 */
public void linkExtractorFinished() {
    linkExtractorFinished = true;
    if (discardedOutlinks > 0) {
        addAnnotation("dol:" + discardedOutlinks);
    }
}

/**
 * Notify the CrawlURI that it is about to be logged; an opportunity for
 * self-annotation.
 */
public void aboutToLog() {
    if (fetchAttempts > 1) {
        addAnnotation(fetchAttempts + "t");
    }
}

/**
 * Get the HTTP recorder associated with this URI.
 *
 * @return Returns the httpRecorder. May be null, but it is set early in
 * FetchHTTP, so there is an issue if it is null.
 */
public HttpRecorder getHttpRecorder() {
    return httpRecorder;
}

/**
 * Set the HTTP recorder to be associated with this URI.
 *
 * @param httpRecorder The httpRecorder to set.
 */
public void setHttpRecorder(HttpRecorder httpRecorder) {
    this.httpRecorder = httpRecorder;
}

/**
 * Return true if this is an HTTP transaction.
 *
 * TODO: Compound this and the {@link #isPost()} method so that there is
 * one place to go to find out if this is HTTP GET, HTTP POST, FTP, or DNS.
 *
 * @return True if this is an HTTP transaction.
 */
public boolean isHttpTransaction() {
    return containsKey(A_HTTP_TRANSACTION);
}

/**
 * Clean up after a run through the processing chain.
 *
 * Called at the end of the processing chain by Frontier#finish. Nulls
 * out any state gathered during processing.
 */
public void processingCleanup() {
    this.httpRecorder = null;
    this.fetchStatus = S_UNATTEMPTED;
    this.setPrerequisite(false);
    this.contentSize = UNCALCULATED;
    this.contentLength = UNCALCULATED;
    // Clear the 'links extracted' flag.
    this.linkExtractorFinished = false;
    // Clean the AList of all but registered permanent members.
    setAList(getPersistentAList());
}

public AList getPersistentAList() {
    AList newAList = new HashtableAList();
    // Copy declared persistent keys.
    if (alistPersistentMember != null && alistPersistentMember.size() > 0) {
        newAList.copyKeysFrom(alistPersistentMember.iterator(), getAList());
    }
    // Also copy declared 'heritable' keys.
    List heritableKeys = (List) getObject(A_HERITABLE_KEYS);
    if (heritableKeys != null) {
        newAList.copyKeysFrom(heritableKeys.iterator(), getAList());
    }
    return newAList;
}

/**
 * Make a <code>CrawlURI</code> from the passed
 * <code>CandidateURI</code>.
 *
 * It's safe to pass a CrawlURI instance; in that case we just return it
 * as the result. Otherwise, we create a new CrawlURI instance.
 *
 * @param caUri Candidate URI.
 * @param ordinal
 * @return A CrawlURI made from the passed CandidateURI.
 */
public static CrawlURI from(CandidateURI caUri, long ordinal) {
    return (caUri instanceof CrawlURI)?
        (CrawlURI)caUri: new CrawlURI(caUri, ordinal);
}

/**
 * @param avatars Credential avatars to save off.
 */
private void setCredentialAvatars(Set avatars) {
    putObject(A_CREDENTIAL_AVATARS_KEY, avatars);
}

/**
 * @return Credential avatars. Null if none set.
 */
@SuppressWarnings("unchecked")
public Set<CredentialAvatar> getCredentialAvatars() {
    return (Set)getObject(A_CREDENTIAL_AVATARS_KEY);
}

/**
 * @return True if there are avatars attached to this instance.
 */
public boolean hasCredentialAvatars() {
    return getCredentialAvatars() != null &&
        getCredentialAvatars().size() > 0;
}

/**
 * Add an avatar.
 *
 * We do lazy instantiation.
 *
 * @param ca Credential avatar to add to the set of avatars.
 */
public void addCredentialAvatar(CredentialAvatar ca) {
    Set<CredentialAvatar> avatars = getCredentialAvatars();
    if (avatars == null) {
        avatars = new HashSet<CredentialAvatar>();
        setCredentialAvatars(avatars);
    }
    avatars.add(ca);
}
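// Usage sketch (not from the original source): why processor state must
// be registered as persistent if it is to survive processingCleanup().
// The "x-my-state" key is hypothetical; addAlistPersistentMember() is
// assumed to be the registration method matching the
// alistPersistentMember field consulted by getPersistentAList() above.
void examplePersistence(CrawlURI curi) {
    curi.putString("x-my-state", "value");
    // Without registration, processingCleanup() would discard the key:
    // getPersistentAList() copies only registered and heritable keys.
    curi.addAlistPersistentMember("x-my-state");
}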
/**
 * Remove all credential avatars from this crawl URI.
 */
public void removeCredentialAvatars() {
    if (hasCredentialAvatars()) {
        remove(A_CREDENTIAL_AVATARS_KEY);
    }
}

/**
 * Remove the passed credential avatar from this crawl URI.
 *
 * @param ca Avatar to remove.
 * @return True if we removed the passed parameter. False if no operation
 * was performed.
 */
public boolean removeCredentialAvatar(CredentialAvatar ca) {
    boolean result = false;
    Set avatars = getCredentialAvatars();
    if (avatars != null && avatars.size() > 0) {
        result = avatars.remove(ca);
    }
    return result;
}

/**
 * Ask this URI if it was a success or not.
 *
 * Only makes sense to call this method after execution of
 * HttpMethod#execute. Regard any status larger than 0 as success,
 * except for the below caveat regarding 401s. Use {@link #is2XXSuccess()}
 * if looking for a status code in the 200 range.
 *
 * <p>401s caveat: If any RFC 2617 credential data is present and we got
 * a 401, assume it got loaded in FetchHTTP on the expectation that we're
 * to go around the processing chain again. Report this condition as a
 * failure so we get another crack at the processing chain, only this
 * time we'll be making use of the loaded credential data.
 *
 * @return True if this URI has been successfully processed.
 * @see #is2XXSuccess()
 */
public boolean isSuccess() {
    boolean result = false;
    int statusCode = this.fetchStatus;
    if (statusCode == HttpStatus.SC_UNAUTHORIZED &&
            hasRfc2617CredentialAvatar()) {
        result = false;
    } else {
        result = (statusCode > 0);
    }
    return result;
}

/**
 * @return True if the status code is in the 2xx range.
 * @see #isSuccess()
 */
public boolean is2XXSuccess() {
    return this.fetchStatus >= 200 && this.fetchStatus < 300;
}

/**
 * @return True if we have an RFC 2617 payload.
 */
public boolean hasRfc2617CredentialAvatar() {
    boolean result = false;
    Set avatars = getCredentialAvatars();
    if (avatars != null && avatars.size() > 0) {
        for (Iterator i = avatars.iterator(); i.hasNext();) {
            if (((CredentialAvatar)i.next()).
                    match(Rfc2617Credential.class)) {
                result = true;
                break;
            }
        }
    }
    return result;
}

/**
 * Set whether this URI should be fetched by sending an HTTP POST
 * request. Otherwise an HTTP GET request will be used.
 *
 * @param b Whether this CrawlURI is to be POST'd. Otherwise it is to be
 * GET'd.
 */
public void setPost(boolean b) {
    this.post = b;
}

/**
 * Returns true if this URI should be fetched by sending an HTTP POST
 * request.
 *
 * TODO: Compound this and the {@link #isHttpTransaction()} method so
 * that there is one place to go to find out if this is HTTP GET, HTTP
 * POST, FTP, or DNS.
 *
 * @return Whether this CrawlURI instance is to be POSTed.
 */
public boolean isPost() {
    return this.post;
}
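// Usage sketch (not from the original source): how a post-fetch step
// might interpret the success logic above. The method name is
// hypothetical; note that isSuccess() treats any status > 0 as success,
// so use is2XXSuccess() when only 2xx responses should count.
void exampleDisposition(CrawlURI curi) {
    if (curi.isSuccess()) {
        // Fetch completed; the status may still be 3xx/4xx/5xx.
    } else if (curi.getFetchStatus() == HttpStatus.SC_UNAUTHORIZED
            && curi.hasRfc2617CredentialAvatar()) {
        // 401 with loaded RFC 2617 credentials: isSuccess() reported
        // failure so the URI gets another pass through the processing
        // chain, this time sending the credentials.
    }
}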