📄 crawluri.java

📁 高性能分词算法
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
     * Increment the number of attempts at getting the document referenced by     * this URI.     *     * @return the number of attempts at getting the document referenced by this     *         URI.     */    public int incrementFetchAttempts() {        // TODO: rename, this is actually processing-loop-attempts        return fetchAttempts++;    }    /**     * Reset fetchAttempts counter.     */    public void resetFetchAttempts() {        this.fetchAttempts = 0;    }    /**     * Reset deferrals counter.     */    public void resetDeferrals() {        this.deferrals = 0;    }    /**     * Get the next processor to process this URI.     *     * @return the processor that should process this URI next.     */    public Processor nextProcessor() {        return nextProcessor;    }    /**     * Get the processor chain that should be processing this URI after the     * current chain is finished with it.     *     * @return the next processor chain to process this URI.     */    public ProcessorChain nextProcessorChain() {        return nextProcessorChain;    }    /**     * Set the next processor to process this URI.     *     * @param processor the next processor to process this URI.     */    public void setNextProcessor(Processor processor) {        nextProcessor = processor;    }    /**     * Set the next processor chain to process this URI.     *     * @param nextProcessorChain the next processor chain to process this URI.     */    public void setNextProcessorChain(ProcessorChain nextProcessorChain) {        this.nextProcessorChain = nextProcessorChain;    }    /**     * Do all actions associated with setting a <code>CrawlURI</code> as     * requiring a prerequisite.     *     * @param lastProcessorChain Last processor chain reference.  This chain is     * where this <code>CrawlURI</code> goes next.     * @param preq Object to set a prerequisite.     
* @throws URIException     */    public void markPrerequisite(String preq,            ProcessorChain lastProcessorChain) throws URIException {        Link link = createLink(preq,Link.PREREQ_MISC,Link.PREREQ_HOP);        setPrerequisiteUri(link);        incrementDeferrals();        setFetchStatus(S_DEFERRED);        skipToProcessorChain(lastProcessorChain);    }    /**     * Set a prerequisite for this URI.     * <p>     * A prerequisite is a URI that must be crawled before this URI can be     * crawled.     *     * @param link Link to set as prereq.     */    public void setPrerequisiteUri(Object link) {        putObject(A_PREREQUISITE_URI, link);    }    /**     * Get the prerequisite for this URI.     * <p>     * A prerequisite is a URI that must be crawled before this URI can be     * crawled.     *     * @return the prerequisite for this URI or null if no prerequisite.     */    public Object getPrerequisiteUri() {        return getObject(A_PREREQUISITE_URI);    }        /**     * @return True if this CrawlURI has a prerequisite.     */    public boolean hasPrerequisiteUri() {        return containsKey(A_PREREQUISITE_URI);    }    /**     * Returns true if this CrawlURI is a prerequisite.     *     * @return true if this CrawlURI is a prerequisite.     */    public boolean isPrerequisite() {        return this.prerequisite;    }    /**     * Set if this CrawlURI is itself a prerequisite URI.     *     * @param prerequisite True if this CrawlURI is itself a prerequiste uri.     */    public void setPrerequisite(boolean prerequisite) {        this.prerequisite = prerequisite;    }    /**     * @return This crawl URI as a string wrapped with 'CrawlURI(' +     * ')'.     
*/
    public String getCrawlURIString() {
        // Read the cache field exactly once outside the lock and return the
        // local copy. Re-reading the field for the return value would be a
        // second unsynchronized read which, if the field is not volatile
        // (its declaration is outside this view), is not guaranteed to see
        // the non-null value observed by the first check.
        String cached = this.cachedCrawlURIString;
        if (cached == null) {
            synchronized (this) {
                // Re-check under the lock: another thread may have filled
                // the cache while this one was waiting.
                cached = this.cachedCrawlURIString;
                if (cached == null) {
                    cached = "CrawlURI(" + toString() + ")";
                    this.cachedCrawlURIString = cached;
                }
            }
        }
        return cached;
    }

    /**
     * Get the content type of this URI.
     *
     * @return Fetched URIs content type.  May be null.
     */
    public String getContentType() {
        return this.contentType;
    }

    /**
     * Set a fetched uri's content type.
     *
     * @param ct Contenttype.  May be null.
     */
    public void setContentType(String ct) {
        this.contentType = ct;
    }

    /**
     * Set the number of the ToeThread responsible for processing this uri.
     *
     * @param i the ToeThread number.
     */
    public void setThreadNumber(int i) {
        threadNumber = i;
    }

    /**
     * Get the number of the ToeThread responsible for processing this uri.
     *
     * @return the ToeThread number.
     */
    public int getThreadNumber() {
        return threadNumber;
    }

    /**
     * Increment the deferral count by one.
     */
    public void incrementDeferrals() {
        deferrals++;
    }

    /**
     * Get the deferral count.
     *
     * @return the deferral count.
     */
    public int getDeferrals() {
        return deferrals;
    }

    /**
     * Remove all attributes set on this uri.
     * <p>
     * This method clears the attribute list by delegating to
     * <code>clearAList()</code>.
     */
    public void stripToMinimal() {
        clearAList();
    }

    /**
     * Get the size in bytes of this URI's recorded content, inclusive
     * of things like protocol headers. It is the responsibility of the
     * classes which fetch the URI to set this value accordingly -- it is
     * not calculated/verified within CrawlURI.
     *
     * This value is consulted in reporting/logging/writing-decisions.
*      * @see #setContentSize()     * @return contentSize     */    public long getContentSize(){        return contentSize;    }    /**     * Make note of a non-fatal error, local to a particular Processor,     * which should be logged somewhere, but allows processing to continue.     *     * This is how you add to the local-error log (the 'localized' in     * the below is making an error local rather than global, not     * making a swiss-french version of the error.).     *      * @param processorName Name of processor the exception was thrown     * in.     * @param ex Throwable to log.     * @param message Extra message to log beyond exception message.     */    public void addLocalizedError(final String processorName,            final Throwable ex, final String message) {        List<LocalizedError> localizedErrors;        if (containsKey(A_LOCALIZED_ERRORS)) {            @SuppressWarnings("unchecked")            List<LocalizedError> temp // to prevent warning on cast             = (List<LocalizedError>) getObject(A_LOCALIZED_ERRORS);            localizedErrors = temp;        } else {            localizedErrors = new ArrayList<LocalizedError>();            putObject(A_LOCALIZED_ERRORS, localizedErrors);        }        localizedErrors.add(new LocalizedError(processorName, ex, message));        addAnnotation("le:" + getClassSimpleName(ex.getClass()) + "@" +            processorName);    }        // TODO: Move to utils.    protected String getClassSimpleName(final Class c) {        String classname = c.getName();        int index = classname.lastIndexOf('.');        return ((index > 0 && (index + 1) < classname.length())?            classname.substring(index + 1): classname);    }    /**     * Add an annotation: an abbrieviated indication of something special     * about this URI that need not be present in every crawl.log line,     * but should be noted for future reference.      
*     * @param annotation the annotation to add; should not contain      * whitespace or a comma     */    public void addAnnotation(String annotation) {        String annotations;        if(containsKey(A_ANNOTATIONS)) {            annotations = getString(A_ANNOTATIONS);            annotations += ","+annotation;        } else {            annotations = annotation;        }        putString(A_ANNOTATIONS,annotations);    }        /**     * TODO: Implement truncation using booleans rather than as this     * ugly String parse.     * @return True if fetch was truncated.     */    public boolean isTruncatedFetch() {        return annotationContains(TRUNC_SUFFIX);    }        public boolean isLengthTruncatedFetch() {        return annotationContains(LENGTH_TRUNC);    }        public boolean isTimeTruncatedFetch() {        return annotationContains(TIMER_TRUNC);    }        public boolean isHeaderTruncatedFetch() {        return annotationContains(HEADER_TRUNC);    }        protected boolean annotationContains(final String str2Find) {        boolean result = false;        if (!containsKey(A_ANNOTATIONS)) {            return result;        }        String annotations = getString(A_ANNOTATIONS);        if (annotations != null && annotations.length() > 0) {            result = annotations.indexOf(str2Find) >= 0;        }        return result;    }    /**     * Get the annotations set for this uri.     *     * @return the annotations set for this uri.     */    public String getAnnotations() {        return (containsKey(A_ANNOTATIONS))?            getString(A_ANNOTATIONS): null;    }    /**     * Get the embeded hop count.     *     * @return the embeded hop count.     * @deprecated      */    public int getEmbedHopCount() {        return embedHopCount;    }    /**     * Get the link hop count.     *     * @return the link hop count.     * @deprecated      */    public int getLinkHopCount() {        return linkHopCount;    }    /**     * Mark this uri as being a seed.     
*
     * Both the link hop count and the embed hop count are set to zero.
     *
     * @deprecated
     */
    public void markAsSeed() {
        this.embedHopCount = 0;
        this.linkHopCount = 0;
    }
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -