📄 candidateuri.java

📁 最强的爬虫工程
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
/* Copyright (C) 2003 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA * * CandidateURI.java * Created on Sep 30, 2003 * * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/datamodel/CandidateURI.java,v 1.49 2006/08/15 00:25:02 paul_jack Exp $ */package org.archive.crawler.datamodel;import java.io.IOException;import java.io.ObjectInputStream;import java.io.ObjectOutputStream;import java.io.PrintWriter;import java.io.Serializable;import java.util.ArrayList;import java.util.Iterator;import java.util.List;import org.apache.commons.httpclient.URIException;import org.archive.crawler.extractor.Link;import org.archive.net.UURI;import org.archive.net.UURIFactory;import org.archive.util.ArchiveUtils;import org.archive.util.Reporter;import st.ata.util.AList;import st.ata.util.HashtableAList;/** * A URI, discovered or passed-in, that may be scheduled. * When scheduled, a CandidateURI becomes a {@link CrawlURI} * made with the data contained herein. A CandidateURI * contains just the fields necessary to perform quick in-scope analysis. *  * <p>Has a flexible attribute list that will be promoted into * any {@link CrawlURI} created from this CandidateURI.  Use it * to add custom data or state needed later doing custom processing. 
* See accessors/setters {@link #putString(String, String)}, * {@link #getString(String)}, etc.  * * @author Gordon Mohr */public class CandidateURIimplements Serializable, Reporter, CoreAttributeConstants {    private static final long serialVersionUID = -7152937921526560388L;    /** Highest scheduling priority.     * Before any others of its class.     */    public static final int HIGHEST = 0;        /** High scheduling priority.     * After any {@link #HIGHEST}.     */    public static final int HIGH = 1;        /** Medium priority.     * After any {@link #HIGH}.     */    public static final int MEDIUM = 2;        /** Normal/low priority.     * Whenever/end of queue.     */    public static final int NORMAL = 3;        private int schedulingDirective = NORMAL;        /**      * Usuable URI under consideration. Transient to allow     * more efficient custom serialization      */    private transient UURI uuri;        /** Seed status */    private boolean isSeed = false;    private boolean forceRevisit = false; // even if already visited        /** String of letters indicating how this URI was reached from a seed.     * <pre>     * P precondition     * R redirection     * E embedded (as frame, src, link, codebase, etc.)     * X speculative embed (as from javascript, some alternate-format extractors     * L link</pre>     * For example LLLE (an embedded image on a page 3 links from seed).     */    private String pathFromSeed;        /**     * Where this URI was (presently) discovered. . Transient to allow     * more efficient custom serialization     */    private transient UURI via;    /**     * Context of URI's discovery, as per the 'context' in Link     */    private CharSequence viaContext;        /**     * Flexible dynamic attributes list.     * <p>     * The attribute list is a flexible map of key/value pairs for storing     * status of this URI for use by other processors. 
By convention the     * attribute list is keyed by constants found in the     * {@link CoreAttributeConstants} interface.  Use this list to carry     * data or state produced by custom processors rather change the     * classes {@link CrawlURI} or this class, CandidateURI.     *     * Transient to allow more efficient custom serialization.     */    private transient AList alist;        /**     * Cache of this candidate uuri as a string.     *     * Profiling shows us spending about 1-2% of total elapsed time in     * toString.     */    private String cachedCandidateURIString = null;        /**     * Frontier/Scheduler lifecycle info.     * This is an identifier set by the Frontier for its     * purposes. Usually its the name of the Frontier queue     * this URI gets queued to.  Values can be host + port     * or IP, etc.     */    private String classKey;    /**     * Constructor.     * Protected access to block access to default constructor.     */    protected CandidateURI () {        super();    }        /**     * @param u uuri instance this CandidateURI wraps.     */    public CandidateURI(UURI u) {        this.uuri = u;    }        /**     * @param u uuri instance this CandidateURI wraps.     * @param pathFromSeed     * @param via     * @param viaContext     */    public CandidateURI(UURI u, String pathFromSeed, UURI via,            CharSequence viaContext) {        this.uuri = u;        this.pathFromSeed = pathFromSeed;        this.via = via;        this.viaContext = viaContext;    }    /**     * Set the <tt>isSeed</tt> attribute of this URI.     * @param b Is this URI a seed, true or false.     
*/    public void setIsSeed(boolean b) {        this.isSeed = b;        if (this.isSeed) {            if(pathFromSeed==null) {                this.pathFromSeed = "";            }//          seeds created on redirect must have a via to be recognized; don't clear//            setVia(null);        }    }    /**     * @return UURI     */    public UURI getUURI() {        return this.uuri;    }    /**     * @return Whether seeded.     */    public boolean isSeed() {        return this.isSeed;    }    /**     * @return path (hop-types) from seed     */    public String getPathFromSeed() {        return this.pathFromSeed;    }    /**     * @return URI via which this one was discovered     */    public UURI getVia() {        return this.via;    }    /**     * @return CharSequence context in which this one was discovered     */    public CharSequence getViaContext() {        return this.viaContext;    }        /**     * @param string     */    protected void setPathFromSeed(String string) {        pathFromSeed = string;    }        /**     * Called when making a copy of another CandidateURI.     * @param alist AList to use.     */    protected void setAList(AList alist) {        this.alist = alist;    }    public void setVia(UURI via) {        this.via = via;    }    /**     * @return This candidate URI as a string wrapped with 'CandidateURI(' +     * ')'.     */    public synchronized String getCandidateURIString() {        if (this.cachedCandidateURIString == null) {            this.cachedCandidateURIString =                "CandidateURI(" + toString() + ")";        }        return this.cachedCandidateURIString;    }    /**     * Method returns string version of this URI's referral URI.     * @return String version of referral URI     */    public String flattenVia() {        return (via == null)? 
"": via.toString();    }        /**     * @return The UURI this CandidateURI wraps as a string      * (We used return what {@link #getCandidateURIString()}     * returns on a toString -- use that method if you still need     * this functionality).     * @see #getCandidateURIString()     */    public String toString() {        return getURIString();    }    /**     * @return URI String     * @deprecated Use {@link #toString()}.     */    public String getURIString() {        return getUURI().toString();    }    /**     * Compares the domain of this CandidateURI with that of another     * CandidateURI     *     * @param other The other CandidateURI     *     * @return True if both are in the same domain, false otherwise.     * @throws URIException     */    public boolean sameDomainAs(CandidateURI other) throws URIException {        String domain = getUURI().getHost();        if (domain == null) {            return false;        }        while(domain.lastIndexOf('.') > domain.indexOf('.')) {            // While has more than one dot, lop off first segment            domain = domain.substring(domain.indexOf('.') + 1);        }        if(other.getUURI().getHost() == null) {            return false;        }        return other.getUURI().getHost().endsWith(domain);    }    /**     * If this method returns true, this URI should be fetched even though     * it already has been crawled. This also implies     * that this URI will be scheduled for crawl before any other waiting     * URIs for the same host.     *     * This value is used to refetch any expired robots.txt or dns-lookups.     *     * @return true if crawling of this URI should be forced     */    public boolean forceFetch() {        return forceRevisit;    }   /**     * Method to signal that this URI should be fetched even though     * it already has been crawled. Setting this to true also implies     * that this URI will be scheduled for crawl before any other waiting     * URIs for the same host.     
*
     * This value is used to refetch any expired robots.txt or dns-lookups.
     *
     * @param b set to true to enforce the crawling of this URI
     */
    public void setForceFetch(boolean b) {
        this.forceRevisit = b;
    }

    /**
     * @return The current scheduling directive.
     */
    public int getSchedulingDirective() {
        return this.schedulingDirective;
    }

    /**
     * @param schedulingDirective The scheduling directive to set.
     */
    public void setSchedulingDirective(int schedulingDirective) {
        this.schedulingDirective = schedulingDirective;
    }

    /**
     * @return True if needs immediate scheduling.
     */
    public boolean needsImmediateScheduling() {
        return schedulingDirective == HIGH;
12 下一页
💿 文件大小 18588 K
👤 上传用户 bonylee_java
📂 所属分类 Jsp/Servlet
🏷️ 相关标签

#工程
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -