
crawluri.java

Heritrix is an open-source, extensible web crawler project. Heritrix is designed to strictly honor the exclusion directives in robots.txt files and META robots tags.
Language: Java
Page 1 of 3
    public boolean isPrerequisite() {
        return this.prerequisite;
    }

    /**
     * Set if this CrawlURI is itself a prerequisite URI.
     *
     * @param prerequisite True if this CrawlURI is itself a prerequisite URI.
     */
    public void setPrerequisite(boolean prerequisite) {
        this.prerequisite = prerequisite;
    }

    /**
     * @return This crawl URI as a string wrapped with 'CrawlURI(' + ')'.
     */
    public String getCrawlURIString() {
        if (this.cachedCrawlURIString == null) {
            synchronized (this) {
                if (this.cachedCrawlURIString == null) {
                    this.cachedCrawlURIString =
                        "CrawlURI(" + toString() + ")";
                }
            }
        }
        return this.cachedCrawlURIString;
    }

    /**
     * Get the content type of this URI.
     *
     * @return Fetched URI's content type. May be null.
     */
    public String getContentType() {
        return this.contentType;
    }

    /**
     * Set a fetched URI's content type.
     *
     * @param ct Content type. May be null.
     */
    public void setContentType(String ct) {
        this.contentType = ct;
    }

    /**
     * Set the number of the ToeThread responsible for processing this URI.
     *
     * @param i the ToeThread number.
     */
    public void setThreadNumber(int i) {
        threadNumber = i;
    }

    /**
     * Get the number of the ToeThread responsible for processing this URI.
     *
     * @return the ToeThread number.
     */
    public int getThreadNumber() {
        return threadNumber;
    }

    /**
     * Increment the deferral count.
     */
    public void incrementDeferrals() {
        deferrals++;
    }

    /**
     * Get the deferral count.
     *
     * @return the deferral count.
     */
    public int getDeferrals() {
        return deferrals;
    }

    /**
     * Remove all attributes set on this URI.
     * <p>
     * This method removes the attribute list.
     */
    public void stripToMinimal() {
        clearAList();
    }

    /**
     * Get the size in bytes of this URI's recorded content, inclusive
     * of things like protocol headers. It is the responsibility of the
     * classes which fetch the URI to set this value accordingly -- it is
     * not calculated/verified within CrawlURI.
     *
     * This value is consulted in reporting/logging/writing decisions.
     *
     * @see #setContentSize(long)
     * @return contentSize
     */
    public long getContentSize() {
        return contentSize;
    }

    /**
     * Make note of a non-fatal error, local to a particular Processor,
     * which should be logged somewhere, but which allows processing to
     * continue.
     *
     * This is how you add to the local-error log (the 'localized' in
     * the name means making an error local rather than global, not
     * making a Swiss-French version of the error).
     *
     * @param processorName Name of the processor the exception was thrown in.
     * @param ex Throwable to log.
     * @param message Extra message to log beyond the exception message.
     */
    public void addLocalizedError(final String processorName,
            final Throwable ex, final String message) {
        List<LocalizedError> localizedErrors;
        if (containsKey(A_LOCALIZED_ERRORS)) {
            @SuppressWarnings("unchecked")
            List<LocalizedError> temp // to prevent warning on cast
                = (List<LocalizedError>) getObject(A_LOCALIZED_ERRORS);
            localizedErrors = temp;
        } else {
            localizedErrors = new ArrayList<LocalizedError>();
            putObject(A_LOCALIZED_ERRORS, localizedErrors);
        }
        localizedErrors.add(new LocalizedError(processorName, ex, message));
        addAnnotation("le:" + getClassSimpleName(ex.getClass()) + "@" +
            processorName);
    }

    // TODO: Move to utils.
    protected String getClassSimpleName(final Class c) {
        String classname = c.getName();
        int index = classname.lastIndexOf('.');
        return ((index > 0 && (index + 1) < classname.length()) ?
            classname.substring(index + 1) : classname);
    }

    /**
     * Add an annotation: an abbreviated indication of something special
     * about this URI that need not be present in every crawl.log line,
     * but should be noted for future reference.
     *
     * @param annotation the annotation to add; should not contain
     * whitespace or a comma
     */
    public void addAnnotation(String annotation) {
        String annotations;
        if (containsKey(A_ANNOTATIONS)) {
            annotations = getString(A_ANNOTATIONS);
            annotations += "," + annotation;
        } else {
            annotations = annotation;
        }
        putString(A_ANNOTATIONS, annotations);
    }

    /**
     * TODO: Implement truncation using booleans rather than as this
     * ugly String parse.
     * @return True if fetch was truncated.
     */
    public boolean isTruncatedFetch() {
        return annotationContains(TRUNC_SUFFIX);
    }

    public boolean isLengthTruncatedFetch() {
        return annotationContains(LENGTH_TRUNC);
    }

    public boolean isTimeTruncatedFetch() {
        return annotationContains(TIMER_TRUNC);
    }

    public boolean isHeaderTruncatedFetch() {
        return annotationContains(HEADER_TRUNC);
    }

    protected boolean annotationContains(final String str2Find) {
        boolean result = false;
        if (!containsKey(A_ANNOTATIONS)) {
            return result;
        }
        String annotations = getString(A_ANNOTATIONS);
        if (annotations != null && annotations.length() > 0) {
            result = annotations.indexOf(str2Find) >= 0;
        }
        return result;
    }

    /**
     * Get the annotations set for this URI.
     *
     * @return the annotations set for this URI.
     */
    public String getAnnotations() {
        return (containsKey(A_ANNOTATIONS)) ?
            getString(A_ANNOTATIONS) : null;
    }

    /**
     * Get the embedded hop count.
     *
     * @return the embedded hop count.
     * @deprecated
     */
    public int getEmbedHopCount() {
        return embedHopCount;
    }

    /**
     * Get the link hop count.
     *
     * @return the link hop count.
     * @deprecated
     */
    public int getLinkHopCount() {
        return linkHopCount;
    }

    /**
     * Mark this URI as being a seed.
     *
     * @deprecated
     */
    public void markAsSeed() {
        linkHopCount = 0;
        embedHopCount = 0;
    }

    /**
     * Get the user agent to use for crawling this URI.
     *
     * If null, the global setting should be used.
     *
     * @return user agent or null
     */
    public String getUserAgent() {
        return userAgent;
    }

    /**
     * Set the user agent to use when crawling this URI.
     *
     * If not set, the global settings should be used.
     *
     * @param string user agent to use
     */
    public void setUserAgent(String string) {
        userAgent = string;
    }

    /**
     * Set which processor should be the next processor to process this URI
     * instead of using the default next processor.
     *
     * @param processorChain the processor chain to skip to.
     * @param processor the processor in the processor chain to skip to.
     */
    public void skipToProcessor(ProcessorChain processorChain,
            Processor processor) {
        setNextProcessorChain(processorChain);
        setNextProcessor(processor);
    }

    /**
     * Set which processor chain should be processing this URI next.
     *
     * @param processorChain the processor chain to skip to.
     */
    public void skipToProcessorChain(ProcessorChain processorChain) {
        setNextProcessorChain(processorChain);
        setNextProcessor(null);
    }

    /**
     * For completed HTTP transactions, the length of the content-body.
     *
     * @return For completed HTTP transactions, the length of the content-body.
     */
    public long getContentLength() {
        if (this.contentLength < 0) {
            this.contentLength = (getHttpRecorder() != null) ?
                getHttpRecorder().getResponseContentLength() : 0;
        }
        return this.contentLength;
    }

    /**
     * Get the size of data recorded (transferred).
     *
     * @return recorded data size
     */
    public long getRecordedSize() {
        return (getHttpRecorder() != null)
            ? getHttpRecorder().getRecordedInput().getSize()
            // if unavailable, fall back on content-size
            : getContentSize();
    }

    /**
     * Sets the 'content size' for the URI, which is considered inclusive
     * of all recorded material (such as protocol headers) or even material
     * 'virtually' considered (as in material from a previous fetch
     * confirmed unchanged with a server). (In contrast, content-length
     * matches the HTTP definition, that of the enclosed content-body.)
     *
     * Should be set by a fetcher or other processor as soon as the final
     * size of recorded content is known. Setting it to an artificial or
     * incorrect value may affect other reporting/processing.
     *
     * @param l Content size.
     */
    public void setContentSize(long l) {
        contentSize = l;
    }

    /**
     * If true, then a link extractor has already claimed this CrawlURI and
     * performed link extraction on the document content. This does not
     * preclude other link extractors that may have an interest in this
     * CrawlURI from also doing link extraction, but the default behavior
     * should be to not run if link extraction has already been done.
     *
     * <p>There is an onus on link extractors to set this flag if they have
     * run.
     *
     * <p>The only extractor of the default Heritrix set that does not
     * respect this flag is
     * {@link org.archive.crawler.extractor.ExtractorHTTP}.
     * It runs against HTTP headers, not the document content.
     *
     * @return True if a processor has performed link extraction on this
     * CrawlURI
     *
     * @see #linkExtractorFinished()
     */
    public boolean hasBeenLinkExtracted() {
        return linkExtractorFinished;
    }

    /**
     * Note that link extraction has been performed on this CrawlURI. A
     * processor doing link extraction should invoke this method once it has
     * finished its work. It should invoke it even if no links are extracted.
     * It should only invoke this method if the link extraction was performed
     * on the document body (not the HTTP headers etc.).
     *
     * @see #hasBeenLinkExtracted()
     */
    public void linkExtractorFinished() {
        linkExtractorFinished = true;
        if (discardedOutlinks > 0) {
            addAnnotation("dol:" + discardedOutlinks);
        }
    }

    /**
     * Notify the CrawlURI that it is about to be logged; an opportunity
     * for self-annotation.
     */
    public void aboutToLog() {
        if (fetchAttempts > 1) {
            addAnnotation(fetchAttempts + "t");
        }
    }

    /**
     * Get the HTTP recorder associated with this URI.
     *
     * @return Returns the httpRecorder. May be null, but it is set early in
     * FetchHTTP, so there is an issue if it is null.
     */
    public HttpRecorder getHttpRecorder() {
        return httpRecorder;
    }

    /**
     * Set the HTTP recorder to be associated with this URI.
     *
     * @param httpRecorder The httpRecorder to set.
     */
    public void setHttpRecorder(HttpRecorder httpRecorder) {
        this.httpRecorder = httpRecorder;
    }

    /**
     * Return true if this is an HTTP transaction.
     *
     * TODO: Compound this and the {@link #isPost()} method so that there is
     * one place to go to find out if it's HTTP GET, HTTP POST, FTP, or DNS.
     *
     * @return True if this is an HTTP transaction.
     */
    public boolean isHttpTransaction() {
        return containsKey(A_HTTP_TRANSACTION);
    }

    /**
     * Clean up after a run through the processing chain.
     *
     * Called at the end of the processing chain by Frontier#finish. Nulls
     * out any state gathered during processing.
     */
    public void processingCleanup() {
        this.httpRecorder = null;
        this.fetchStatus = S_UNATTEMPTED;
        this.setPrerequisite(false);
        this.contentSize = UNCALCULATED;
        this.contentLength = UNCALCULATED;
        // Clear 'links extracted' flag.
        this.linkExtractorFinished = false;
        // Clean the alist of all but registered permanent members.
        setAList(getPersistentAList());
    }

    public AList getPersistentAList() {
        AList newAList = new HashtableAList();
        // Copy declared persistent keys.
        if (alistPersistentMember != null && alistPersistentMember.size() > 0) {
            newAList.copyKeysFrom(alistPersistentMember.iterator(), getAList());
        }
        // Also copy declared 'heritable' keys.
        List heritableKeys = (List) getObject(A_HERITABLE_KEYS);
        if (heritableKeys != null) {
            newAList.copyKeysFrom(heritableKeys.iterator(), getAList());
        }
        return newAList;
    }

    /**
     * Make a <code>CrawlURI</code> from the passed <code>CandidateURI</code>.
     *
     * It's safe to pass a CrawlURI instance. In this case we just return it
     * as the result. Otherwise, we create a new CrawlURI instance.
     *
     * @param caUri Candidate URI.
     * @param ordinal
     * @return A CrawlURI made from the passed CandidateURI.
     */
    public static CrawlURI from(CandidateURI caUri, long ordinal) {
        return (caUri instanceof CrawlURI) ?
            (CrawlURI) caUri : new CrawlURI(caUri, ordinal);
    }

    /**
     * @param avatars Credential avatars to save off.
     */
    private void setCredentialAvatars(Set avatars) {
        putObject(A_CREDENTIAL_AVATARS_KEY, avatars);
    }

    /**
     * @return Credential avatars. Null if none set.
     */
    @SuppressWarnings("unchecked")
    public Set<CredentialAvatar> getCredentialAvatars() {
        return (Set) getObject(A_CREDENTIAL_AVATARS_KEY);
    }

    /**
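The annotation methods above (addAnnotation, annotationContains, the is*TruncatedFetch family) all share one convention: annotations live in a single comma-joined string, and membership is tested by substring search. A minimal standalone sketch of that convention follows; AnnotationSketch is an illustrative stand-in, not a Heritrix class, and it stores the string in a plain field rather than the AList used by CrawlURI.

```java
// Minimal sketch of CrawlURI's comma-joined annotation convention.
// AnnotationSketch is illustrative only; it is not part of Heritrix.
public class AnnotationSketch {
    private String annotations; // null until the first annotation is added

    /** Append an annotation; should not contain whitespace or a comma. */
    public void addAnnotation(String annotation) {
        annotations = (annotations == null)
            ? annotation
            : annotations + "," + annotation;
    }

    /** Substring search, mirroring the "ugly String parse" TODO above:
     *  e.g. "3t" would also match inside "13t", which is why the original
     *  code suggests replacing this with booleans. */
    public boolean annotationContains(String str2Find) {
        return annotations != null && annotations.indexOf(str2Find) >= 0;
    }

    public String getAnnotations() {
        return annotations;
    }

    public static void main(String[] args) {
        AnnotationSketch a = new AnnotationSketch();
        a.addAnnotation("le:IOException@FetchHTTP");
        a.addAnnotation("dol:3");
        System.out.println(a.getAnnotations()); // le:IOException@FetchHTTP,dol:3
        System.out.println(a.annotationContains("dol")); // true
    }
}
```

In the real class, addLocalizedError and linkExtractorFinished feed this same string with "le:..." and "dol:..." markers, which is why crawl.log annotations appear as one comma-separated column.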
