📄 crawluri.java

📁 Heritrix是一个开源、可扩展的web爬虫项目。Heritrix被设计为严格遵循robots.txt文件的排除指示和META robots标签。
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
上一页 1 23
     * @return True if there are avatars attached to this instance.     */    public boolean hasCredentialAvatars() {        return getCredentialAvatars() != null &&            getCredentialAvatars().size() > 0;    }    /**     * Add an avatar.     *     * We do lazy instantiation.     *     * @param ca Credential avatar to add to set of avatars.     */    public void addCredentialAvatar(CredentialAvatar ca) {        Set<CredentialAvatar> avatars = getCredentialAvatars();        if (avatars == null) {            avatars = new HashSet<CredentialAvatar>();            setCredentialAvatars(avatars);        }        avatars.add(ca);    }    /**     * Remove all credential avatars from this crawl uri.     */    public void removeCredentialAvatars() {        if (hasCredentialAvatars()) {            remove(A_CREDENTIAL_AVATARS_KEY);        }    }    /**     * Remove all credential avatars from this crawl uri.     * @param ca Avatar to remove.     * @return True if we removed passed parameter.  False if no operation     * performed.     */    public boolean removeCredentialAvatar(CredentialAvatar ca) {        boolean result = false;        Set avatars = getCredentialAvatars();        if (avatars != null && avatars.size() > 0) {            result = avatars.remove(ca);        }        return result;    }    /**     * Ask this URI if it was a success or not.     *     * Only makes sense to call this method after execution of     * HttpMethod#execute. Regard any status larger then 0 as success     * except for below caveat regarding 401s.  Use {@link #is2XXSuccess()} if     * looking for a status code in the 200 range.     *     * <p>401s caveat: If any rfc2617 credential data present and we got a 401     * assume it got loaded in FetchHTTP on expectation that we're to go around     * the processing chain again. Report this condition as a failure so we     * get another crack at the processing chain only this time we'll be making     * use of the loaded credential data.     
*     * @return True if ths URI has been successfully processed.     * @see #is2XXSuccess()     */    public boolean isSuccess() {        boolean result = false;        int statusCode = this.fetchStatus;        if (statusCode == HttpStatus.SC_UNAUTHORIZED &&            hasRfc2617CredentialAvatar()) {            result = false;        } else {            result = (statusCode > 0);        }        return result;    }        /**     * @return True if status code is in the 2xx range.     * @see #isSuccess()     */    public boolean is2XXSuccess() {    	return this.fetchStatus >= 200 && this.fetchStatus < 300;    }    /**	 * @return True if we have an rfc2617 payload.	 */	public boolean hasRfc2617CredentialAvatar() {	    boolean result = false;	    Set avatars = getCredentialAvatars();	    if (avatars != null && avatars.size() > 0) {	        for (Iterator i = avatars.iterator(); i.hasNext();) {	            if (((CredentialAvatar)i.next()).	                match(Rfc2617Credential.class)) {	                result = true;	                break;	            }	        }	    }        return result;	}    /**     * Set whether this URI should be fetched by sending a HTTP POST request.     * Else a HTTP GET request will be used.     *     * @param b Set whether this curi is to be POST'd.  Else its to be GET'd.     */    public void setPost(boolean b) {        this.post = b;    }    /**     * Returns true if this URI should be fetched by sending a HTTP POST request.     *     *     * TODO: Compound this and {@link #isHttpTransaction()} method so that there     * is one place to go to find out if get http, post http, ftp, dns.     *     * @return Returns is this CrawlURI instance is to be posted.     */    public boolean isPost() {        return this.post;    }    /**     * Set the retained content-digest value (usu. SHA1).      
*      * @param digestValue     * @deprecated Use {@link #setContentDigest(String scheme, byte[])}     */    public void setContentDigest(byte[] digestValue) {        setContentDigest("SHA1", digestValue);    }        public void setContentDigest(final String scheme,            final byte [] digestValue) {        this.contentDigest = digestValue;        this.contentDigestScheme = scheme;    }        public String getContentDigestSchemeString() {        if(this.contentDigest==null) {            return null;        }        return this.contentDigestScheme + ":" + getContentDigestString();    }    /**     * Return the retained content-digest value, if any.     *      * @return Digest value.     */    public Object getContentDigest() {        return contentDigest;    }        public String getContentDigestString() {        if(this.contentDigest==null) {            return null;        }        return Base32.encode(this.contentDigest);    }    transient Object holder;    transient Object holderKey;    /**     * Remember a 'holder' to which some enclosing/queueing     * facility has assigned this CrawlURI     * .     * @param obj     */    public void setHolder(Object obj) {        holder=obj;    }    /**     * Return the 'holder' for the convenience of      * an external facility.     *     * @return holder     */    public Object getHolder() {        return holder;    }    /**     * Remember a 'holderKey' which some enclosing/queueing     * facility has assigned this CrawlURI     * .     * @param obj     */    public void setHolderKey(Object obj) {        holderKey=obj;    }    /**     * Return the 'holderKey' for convenience of      * an external facility (Frontier).     *      * @return holderKey      */    public Object getHolderKey() {        return holderKey;    }    /**     * Get the ordinal (serial number) assigned at creation.     
*      * @return ordinal     */    public long getOrdinal() {        return ordinal;    }    /** spot for an integer cost to be placed by external facility (frontier).     *  cost is truncated to 8 bits at times, so should not exceed 255 */    int holderCost = UNCALCULATED;    /**     * Return the 'holderCost' for convenience of external facility (frontier)     * @return value of holderCost     */    public int getHolderCost() {        return holderCost;    }    /**     * Remember a 'holderCost' which some enclosing/queueing     * facility has assigned this CrawlURI     * @param cost value to remember     */    public void setHolderCost(int cost) {        holderCost = cost;    }    /**      * All discovered outbound Links (navlinks, embeds, etc.)      * Can either contain Link instances or CandidateURI instances, or both.     * The LinksScoper processor converts Link instances in this collection     * to CandidateURI instances.      */    transient Collection<Object> outLinks = new HashSet<Object>();        /**     * Returns discovered links.  The returned collection might be empty if     * no links were discovered, or if something like LinksScoper promoted     * the links to CandidateURIs.     *      * Elements can be removed from the returned collection, but not added.     * To add a discovered link, use one of the createAndAdd methods or     * {@link #getOutObjects()}.     *      * @return Collection of all discovered outbound Links     */    public Collection<Link> getOutLinks() {        return Transform.subclasses(outLinks, Link.class);    }        /**     * Returns discovered candidate URIs.  The returned collection will be     * emtpy until something like LinksScoper promotes discovered Links     * into CandidateURIs.     *      * Elements can be removed from the returned collection, but not added.     * To add a candidate URI, use {@link #replaceOutlinks(Collection)} or     * {@link #getOutObjects}.     
*      * @return  Collection of candidate URIs     */    public Collection<CandidateURI> getOutCandidates() {        return Transform.subclasses(outLinks, CandidateURI.class);    }            /**     * Returns all of the outbound objects.  The returned Collection will     * contain Link instances, or CandidateURI instances, or both.       *      * @return  the collection of Links and/or CandidateURIs     */    public Collection<Object> getOutObjects() {        return outLinks;    }        /**     * Add a discovered Link, unless it would exceed the max number     * to accept. (If so, increment discarded link counter.)      *      * @param link the Link to add     */    public void addOutLink(Link link) {        if (outLinks.size() < MAX_OUTLINKS) {            outLinks.add(link);        } else {            // note & discard            discardedOutlinks++;        }    }        public void clearOutlinks() {        this.outLinks.clear();    }        /**     * Replace current collection of links w/ passed list.     * Used by Scopers adjusting the list of links (removing those     * not in scope and promoting Links to CandidateURIs).     *      * @param a collection of CandidateURIs replacing any previously     *   existing outLinks or outCandidates     */    public void replaceOutlinks(Collection<CandidateURI> links) {        clearOutlinks();        this.outLinks.addAll(links);    }            /**     * @return Count of outlinks.     */    public int outlinksSize() {        return this.outLinks.size();    }    /**     * Convenience method for creating a Link discovered at this URI     * with the given string and context     *      * @param url     *            String to use to create Link     * @param context     *            CharSequence context to use     * @param hopType     * @return Link.     
* @throws URIException     *             if Link UURI cannot be constructed     */    public Link createLink(String url, CharSequence context,            char hopType) throws URIException {        return new Link(getUURI(), UURIFactory.getInstance(getUURI(),                url), context, hopType);    }        /**     * Convenience method for creating a Link with the given string and     * context     *      * @param url     *            String to use to create Link     * @param context     *            CharSequence context to use     * @param hopType     * @throws URIException     *             if Link UURI cannot be constructed     */    public void createAndAddLink(String url, CharSequence context,            char hopType) throws URIException {        addOutLink(createLink(url, context, hopType));    }    /**     * Convenience method for creating a Link with the given string and     * context, relative to a previously set base HREF if available (or     * relative to the current CrawlURI if no other base has been set)     *      * @param url String URL to add as destination of link     * @param context String context where link was discovered     * @param hopType char hop-type indicator     * @throws URIException     */    public void createAndAddLinkRelativeToBase(String url,            CharSequence context, char hopType) throws URIException {        addOutLink(new Link(getUURI(), UURIFactory.getInstance(                getBaseURI(), url), context, hopType));    }        /**     * Convenience method for creating a Link with the given string and     * context, relative to this CrawlURI's via UURI if available. (If     * a via is not available, falls back to using      * #createAndAddLinkRelativeToBase.)     
*      * @param url String URL to add as destination of link     * @param context String context where link was discovered     * @param hopType char hop-type indicator     * @throws URIException     */    public void createAndAddLinkRelativeToVia(String url,            CharSequence context, char hopType) throws URIException {        if(getVia()!=null) {            addOutLink(new Link(getUURI(), UURIFactory.getInstance(                getVia(), url), context, hopType));        } else {            // if no 'via', fall back to base/self            createAndAddLinkRelativeToBase(url,context,hopType);        }    }        /**     * Set the (HTML) Base URI used for derelativizing internal URIs.      *      * @param baseHref String base href to use     * @throws URIException if supplied string cannot be interpreted as URI     */    public void setBaseURI(String baseHref) throws URIException {        putObject(A_HTML_BASE, UURIFactory.getInstance(baseHref));    }          /**     * Get the (HTML) Base URI used for derelativizing internal URIs.      *     * @return UURI base URI previously set      */      public UURI getBaseURI() {        if (!containsKey(A_HTML_BASE)) {            return getUURI();        }        return (UURI)getObject(A_HTML_BASE);    }        /**     * Add the key of alist items you want to persist across     * processings.     * @param key Key to add.     */    public static void addAlistPersistentMember(Object key) {        alistPersistentMember.add(key);    }        /**     * @param key Key to remove.     * @return True if list contained the element.     */    public static boolean removeAlistPersistentMember(Object key) {        return alistPersistentMember.remove(key);    }    /**     * Custom serialization writing an empty 'outLinks' as null. Estimated     * to save ~20 bytes in serialized form.      
*
     * @param stream stream to serialize into
     * @throws IOException
     */
    private void writeObject(ObjectOutputStream stream) throws IOException {
        stream.defaultWriteObject();
        // 'outLinks' is transient, so it is written by hand: null stands in
        // for an empty collection.
        if (outLinks.isEmpty()) {
            stream.writeObject(null);
        } else {
            stream.writeObject(outLinks);
        }
    }

    /**
     * Custom deserialization: a null in the 'outLinks' slot is turned back
     * into a fresh, empty HashSet.
     *
     * @param stream stream to deserialize from
     * @throws IOException
     * @throws ClassNotFoundException
     */
    private void readObject(ObjectInputStream stream) throws IOException,
            ClassNotFoundException {
        stream.defaultReadObject();
        @SuppressWarnings("unchecked")
        HashSet<Object> restored = (HashSet<Object>) stream.readObject();
        if (restored == null) {
            restored = new HashSet<Object>();
        }
        outLinks = restored;
    }
}
上一页 1 23
💿 文件大小 10016 K
👤 上传用户 qqpp2q
📂 所属分类数值算法/人工智能
🏷️ 相关标签

#Heritrix #robots #META #web
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -