// CrawlURI.java (excerpt)
/**
 * Get the user agent to use for crawling this URI.
 *
 * If null, the global setting should be used.
 *
 * @return user agent or null
 */
public String getUserAgent() {
    return userAgent;
}

/**
 * Set the user agent to use when crawling this URI.
 *
 * If not set, the global settings should be used.
 *
 * @param string user agent to use
 */
public void setUserAgent(String string) {
    userAgent = string;
}

/**
 * Set which processor should be the next processor to process this URI,
 * instead of using the default next processor.
 *
 * @param processorChain the processor chain to skip to.
 * @param processor the processor in the processor chain to skip to.
 */
public void skipToProcessor(ProcessorChain processorChain,
        Processor processor) {
    setNextProcessorChain(processorChain);
    setNextProcessor(processor);
}

/**
 * Set which processor chain should be processing this URI next.
 *
 * @param processorChain the processor chain to skip to.
 */
public void skipToProcessorChain(ProcessorChain processorChain) {
    setNextProcessorChain(processorChain);
    setNextProcessor(null);
}

/**
 * For completed HTTP transactions, the length of the content-body.
 *
 * @return For completed HTTP transactions, the length of the content-body.
 */
public long getContentLength() {
    if (this.contentLength < 0) {
        this.contentLength = (getHttpRecorder() != null)?
            getHttpRecorder().getResponseContentLength(): 0;
    }
    return this.contentLength;
}

/**
 * Get the size of data recorded (transferred).
 *
 * @return recorded data size
 */
public long getRecordedSize() {
    return (getHttpRecorder() != null)?
        getHttpRecorder().getRecordedInput().getSize():
        // If unavailable, fall back on content-size.
        getContentSize();
}

/**
 * Sets the 'content size' for the URI, which is considered inclusive of
 * all recorded material (such as protocol headers) and even material
 * 'virtually' considered (as in material from a previous fetch confirmed
 * unchanged with a server). (In contrast, content-length matches the
 * HTTP definition: that of the enclosed content-body.)
 *
 * Should be set by a fetcher or other processor as soon as the final
 * size of recorded content is known. Setting an artificial/incorrect
 * value may affect other reporting/processing.
 *
 * @param l Content size.
 */
public void setContentSize(long l) {
    contentSize = l;
}

/**
 * If true, a link extractor has already claimed this CrawlURI and
 * performed link extraction on the document content. This does not
 * preclude other link extractors that may have an interest in this
 * CrawlURI from also doing link extraction, but the default behavior
 * should be to not run if link extraction has already been done.
 *
 * <p>There is an onus on link extractors to set this flag if they have
 * run.
 *
 * <p>The only extractor of the default Heritrix set that does not
 * respect this flag is
 * {@link org.archive.crawler.extractor.ExtractorHTTP}.
 * It runs against the HTTP headers, not the document content.
 *
 * @return True if a processor has performed link extraction on this
 * CrawlURI
 *
 * @see #linkExtractorFinished()
 */
public boolean hasBeenLinkExtracted() {
    return linkExtractorFinished;
}
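// Usage sketch (not from the original source): how a custom link
// extractor is expected to cooperate with the link-extraction flag
// above. The method name and the extraction step are hypothetical; the
// calls on 'curi' are the CrawlURI methods defined in this class.
void exampleExtract(CrawlURI curi) {
    if (curi.hasBeenLinkExtracted()) {
        // Default behavior: skip, another extractor has already claimed
        // this CrawlURI.
        return;
    }
    // ... parse the recorded document content and add outlinks ...
    // Set the flag even when no links were found, because the extraction
    // ran against the document body (not the HTTP headers).
    curi.linkExtractorFinished();
}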
/**
 * Note that link extraction has been performed on this CrawlURI. A
 * processor doing link extraction should invoke this method once it has
 * finished its work. It should invoke it even if no links are extracted.
 * It should only invoke this method if the link extraction was performed
 * on the document body (not the HTTP headers etc.).
 *
 * @see #hasBeenLinkExtracted()
 */
public void linkExtractorFinished() {
    linkExtractorFinished = true;
    if (discardedOutlinks > 0) {
        addAnnotation("dol:" + discardedOutlinks);
    }
}

/**
 * Notify the CrawlURI that it is about to be logged; an opportunity for
 * self-annotation.
 */
public void aboutToLog() {
    if (fetchAttempts > 1) {
        addAnnotation(fetchAttempts + "t");
    }
}

/**
 * Get the HTTP recorder associated with this URI.
 *
 * @return Returns the httpRecorder. May be null, but it is set early in
 * FetchHTTP, so there is an issue if it is null.
 */
public HttpRecorder getHttpRecorder() {
    return httpRecorder;
}

/**
 * Set the HTTP recorder to be associated with this URI.
 *
 * @param httpRecorder The httpRecorder to set.
 */
public void setHttpRecorder(HttpRecorder httpRecorder) {
    this.httpRecorder = httpRecorder;
}

/**
 * Return true if this is an HTTP transaction.
 *
 * TODO: Compound this and the {@link #isPost()} method so that there is
 * one place to go to find out if this is HTTP GET, HTTP POST, FTP, or DNS.
 *
 * @return True if this is an HTTP transaction.
 */
public boolean isHttpTransaction() {
    return containsKey(A_HTTP_TRANSACTION);
}

/**
 * Clean up after a run through the processing chain.
 *
 * Called at the end of the processing chain by Frontier#finish. Nulls
 * out any state gathered during processing.
 */
public void processingCleanup() {
    this.httpRecorder = null;
    this.fetchStatus = S_UNATTEMPTED;
    this.setPrerequisite(false);
    this.contentSize = UNCALCULATED;
    this.contentLength = UNCALCULATED;
    // Clear the 'links extracted' flag.
    this.linkExtractorFinished = false;
    // Clean the AList of all but registered permanent members.
    setAList(getPersistentAList());
}

public AList getPersistentAList() {
    AList newAList = new HashtableAList();
    // Copy declared persistent keys.
    if (alistPersistentMember != null && alistPersistentMember.size() > 0) {
        newAList.copyKeysFrom(alistPersistentMember.iterator(), getAList());
    }
    // Also copy declared 'heritable' keys.
    List heritableKeys = (List) getObject(A_HERITABLE_KEYS);
    if (heritableKeys != null) {
        newAList.copyKeysFrom(heritableKeys.iterator(), getAList());
    }
    return newAList;
}

/**
 * Make a <code>CrawlURI</code> from the passed
 * <code>CandidateURI</code>.
 *
 * It's safe to pass a CrawlURI instance; in that case we just return it
 * as the result. Otherwise, we create a new CrawlURI instance.
 *
 * @param caUri Candidate URI.
 * @param ordinal
 * @return A CrawlURI made from the passed CandidateURI.
 */
public static CrawlURI from(CandidateURI caUri, long ordinal) {
    return (caUri instanceof CrawlURI)?
        (CrawlURI)caUri: new CrawlURI(caUri, ordinal);
}

/**
 * @param avatars Credential avatars to save off.
 */
private void setCredentialAvatars(Set avatars) {
    putObject(A_CREDENTIAL_AVATARS_KEY, avatars);
}

/**
 * @return Credential avatars. Null if none set.
 */
@SuppressWarnings("unchecked")
public Set<CredentialAvatar> getCredentialAvatars() {
    return (Set)getObject(A_CREDENTIAL_AVATARS_KEY);
}

/**
 * @return True if there are avatars attached to this instance.
 */
public boolean hasCredentialAvatars() {
    return getCredentialAvatars() != null &&
        getCredentialAvatars().size() > 0;
}

/**
 * Add an avatar.
 *
 * We do lazy instantiation.
 *
 * @param ca Credential avatar to add to the set of avatars.
 */
public void addCredentialAvatar(CredentialAvatar ca) {
    Set<CredentialAvatar> avatars = getCredentialAvatars();
    if (avatars == null) {
        avatars = new HashSet<CredentialAvatar>();
        setCredentialAvatars(avatars);
    }
    avatars.add(ca);
}
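// Usage sketch (not from the original source): why processor state must
// be registered as persistent if it is to survive processingCleanup().
// The "x-my-state" key is hypothetical; addAlistPersistentMember() is
// assumed to be the registration method matching the
// alistPersistentMember field consulted by getPersistentAList() above.
void examplePersistence(CrawlURI curi) {
    curi.putString("x-my-state", "value");
    // Without registration, processingCleanup() would discard the key:
    // getPersistentAList() copies only registered and heritable keys.
    curi.addAlistPersistentMember("x-my-state");
}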
/**
 * Remove all credential avatars from this crawl URI.
 */
public void removeCredentialAvatars() {
    if (hasCredentialAvatars()) {
        remove(A_CREDENTIAL_AVATARS_KEY);
    }
}

/**
 * Remove the passed credential avatar from this crawl URI.
 *
 * @param ca Avatar to remove.
 * @return True if we removed the passed parameter. False if no operation
 * was performed.
 */
public boolean removeCredentialAvatar(CredentialAvatar ca) {
    boolean result = false;
    Set avatars = getCredentialAvatars();
    if (avatars != null && avatars.size() > 0) {
        result = avatars.remove(ca);
    }
    return result;
}

/**
 * Ask this URI if it was a success or not.
 *
 * Only makes sense to call this method after execution of
 * HttpMethod#execute. Regard any status larger than 0 as success,
 * except for the below caveat regarding 401s. Use {@link #is2XXSuccess()}
 * if looking for a status code in the 200 range.
 *
 * <p>401s caveat: If any RFC 2617 credential data is present and we got
 * a 401, assume it got loaded in FetchHTTP on the expectation that we're
 * to go around the processing chain again. Report this condition as a
 * failure so we get another crack at the processing chain, only this
 * time we'll be making use of the loaded credential data.
 *
 * @return True if this URI has been successfully processed.
 * @see #is2XXSuccess()
 */
public boolean isSuccess() {
    boolean result = false;
    int statusCode = this.fetchStatus;
    if (statusCode == HttpStatus.SC_UNAUTHORIZED &&
            hasRfc2617CredentialAvatar()) {
        result = false;
    } else {
        result = (statusCode > 0);
    }
    return result;
}

/**
 * @return True if the status code is in the 2xx range.
 * @see #isSuccess()
 */
public boolean is2XXSuccess() {
    return this.fetchStatus >= 200 && this.fetchStatus < 300;
}

/**
 * @return True if we have an RFC 2617 payload.
 */
public boolean hasRfc2617CredentialAvatar() {
    boolean result = false;
    Set avatars = getCredentialAvatars();
    if (avatars != null && avatars.size() > 0) {
        for (Iterator i = avatars.iterator(); i.hasNext();) {
            if (((CredentialAvatar)i.next()).
                    match(Rfc2617Credential.class)) {
                result = true;
                break;
            }
        }
    }
    return result;
}

/**
 * Set whether this URI should be fetched by sending an HTTP POST
 * request. Otherwise an HTTP GET request will be used.
 *
 * @param b Whether this CrawlURI is to be POST'd. Otherwise it is to be
 * GET'd.
 */
public void setPost(boolean b) {
    this.post = b;
}

/**
 * Returns true if this URI should be fetched by sending an HTTP POST
 * request.
 *
 * TODO: Compound this and the {@link #isHttpTransaction()} method so
 * that there is one place to go to find out if this is HTTP GET, HTTP
 * POST, FTP, or DNS.
 *
 * @return Whether this CrawlURI instance is to be POSTed.
 */
public boolean isPost() {
    return this.post;
}
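// Usage sketch (not from the original source): how a post-fetch step
// might interpret the success logic above. The method name is
// hypothetical; note that isSuccess() treats any status > 0 as success,
// so use is2XXSuccess() when only 2xx responses should count.
void exampleDisposition(CrawlURI curi) {
    if (curi.isSuccess()) {
        // Fetch completed; the status may still be 3xx/4xx/5xx.
    } else if (curi.getFetchStatus() == HttpStatus.SC_UNAUTHORIZED
            && curi.hasRfc2617CredentialAvatar()) {
        // 401 with loaded RFC 2617 credentials: isSuccess() reported
        // failure so the URI gets another pass through the processing
        // chain, this time sending the credentials.
    }
}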