📄 crawler.java
/**
 * Callback for visiting a page.  Default version does nothing.
 *
 * @param page Page retrieved by the crawler
 */
public void visit (Page page) {
}

/**
 * Callback for testing whether a link should be traversed.
 * Default version returns true for all links.  Override this method
 * for more interesting behavior.
 *
 * @param l Link encountered by the crawler
 * @return true if link should be followed, false if it should be ignored.
 */
public boolean shouldVisit (Link l) {
    return true;
}

/**
 * Expand the crawl from a page.  The default implementation of this
 * method tests every link on the page using shouldVisit(), and
 * submit()s the links that are approved.  A subclass may want to override
 * this method if it's inconvenient to consider the links individually
 * with shouldVisit().
 * @param page Page to expand
 */
public void expand (Page page) {
    // examine each link on the page
    Link[] links = page.getLinks();

    if (links != null && links.length > 0) {
        // give each link a default priority based on its page
        // and position on page
        float priority = (depthFirst ? -numPagesVisited : numPagesVisited);
        float increment = 1.0f/links.length;

        for (int i=0; i<links.length; ++i) {
            Link l = links[i];

            // set default download parameters
            l.setPriority (priority);
            priority += increment;
            l.setDownloadParameters (dp);

            ++numLinksTested;
            if (ignoreVisitedLinks && visited (l))
                // FIX: use atomic test-and-set
                // FIX: set l.page somehow?
                sendLinkEvent (l, LinkEvent.ALREADY_VISITED);
            else if (!((type == null || l.hasAnyLabels (type))
                       && (domain == null || l.hasAnyLabels (domain))
                       && (linkPredicate == null || linkPredicate.shouldVisit (l))
                       && shouldVisit (l)))
                sendLinkEvent (l, LinkEvent.SKIPPED);
            else if (page.getDepth() >= maxDepth)
                sendLinkEvent (l, LinkEvent.TOO_DEEP);
            else
                submit (l);
        }
    }
}

/*
 * Crawl statistics
 */

/**
 * Get number of pages visited.
 * @return number of pages passed to visit() so far in this crawl
 */
public int getPagesVisited() {
    return numPagesVisited;
}

/**
 * Get number of links tested.
 * @return number of links passed to shouldVisit() so far in this crawl
 */
public int getLinksTested() {
    return numLinksTested;
}

/**
 * Get number of pages left to be visited.
 * @return number of links approved by shouldVisit() but not yet visited
 */
public int getPagesLeft() {
    return numPagesLeft;
}

/**
 * Get number of threads currently working.
 * @return number of threads downloading pages
 */
public int getActiveThreads () {
    Worm[] w = worms;
    if (w == null)
        return 0;

    int n = 0;
    for (int i=0; i<w.length; ++i)
        if (w[i] != null && w[i].link != null)
            ++n;
    return n;
}

/*
 * Crawler parameters
 */

/**
 * Get human-readable name of crawler.  Default value is the
 * class name, e.g., "Crawler".  Useful for identifying the crawler in a
 * user interface; also used as the default User-agent for identifying
 * the crawler to a remote Web server.  (The User-agent can be
 * changed independently of the crawler name with setDownloadParameters().)
 * @return human-readable name of crawler
 */
public String getName () {
    return name;
}

/**
 * Set human-readable name of crawler.
 * @param name new name for crawler
 */
public void setName (String name) {
    this.name = name;
}

/**
 * Convert the crawler to a String.
 * @return Human-readable name of crawler.
 */
public String toString () {
    return getName ();
}

/**
 * Get starting points of crawl as an array of Link objects.
 * @return array of Links from which crawler will start its next crawl.
 */
public Link[] getRoots () {
    if (roots == null)
        return new Link[0];

    Link[] result = new Link[roots.length];
    System.arraycopy (roots, 0, result, 0, roots.length);
    return result;
}

/**
 * Get roots of last crawl.  May differ from getRoots()
 * if new roots have been set.
 * @return array of Links from which crawler started its last crawl,
 * or null if the crawler was cleared.
 */
public Link[] getCrawledRoots () {
    if (crawledRoots == null)
        return null;

    Link[] result = new Link[crawledRoots.length];
    System.arraycopy (crawledRoots, 0, result, 0, crawledRoots.length);
    return result;
}

/**
 * Get starting points of crawl as a String of newline-delimited URLs.
 * @return URLs where crawler will start, separated by newlines.
 */
public String getRootHrefs () {
    StringBuffer buf = new StringBuffer ();
    if (roots != null) {
        for (int i=0; i<roots.length; ++i) {
            if (buf.length() > 0)
                buf.append ('\n');
            buf.append (roots[i].getURL().toExternalForm());
        }
    }
    return buf.toString ();
}

/**
 * Set starting points of crawl as a string of whitespace-delimited URLs.
 * @param hrefs URLs of starting points, separated by space, \t, or \n
 * @exception java.net.MalformedURLException if any of the URLs is invalid,
 * leaving starting points unchanged
 */
public void setRootHrefs (String hrefs) throws MalformedURLException {
    Vector v = new Vector ();
    StringTokenizer tok = new StringTokenizer (hrefs);
    while (tok.hasMoreElements ())
        v.addElement (new Link (tok.nextToken()));
    roots = new Link[v.size()];
    v.copyInto (roots);
}

/**
 * Set starting point of crawl as a single Link.
 * @param link starting point
 */
public void setRoot (Link link) {
    roots = new Link[1];
    roots[0] = link;
}

/**
 * Set starting points of crawl as an array of Links.
 * @param links starting points
 */
public void setRoots (Link[] links) {
    roots = new Link[links.length];
    System.arraycopy (links, 0, roots, 0, links.length);
}

/**
 * Add a root to the existing set of roots.
 * @param link starting point to add
 */
public void addRoot (Link link) {
    if (roots == null)
        setRoot (link);
    else {
        Link newroots[] = new Link[roots.length+1];
        System.arraycopy (roots, 0, newroots, 0, roots.length);
        newroots[newroots.length-1] = link;
        roots = newroots;
    }
}

/**
 * Get crawl domain.  Default value is WEB.
 * @return WEB, SERVER, or SUBTREE.
 */
public String[] getDomain () {
    return domain;
}

/**
 * Set crawl domain.
 * @param domain one of WEB, SERVER, or SUBTREE.
 */
public void setDomain (String[] domain) {
    this.domain = domain;
}

/**
 * Get legal link types to crawl.  Default value is HYPERLINKS.
 * @return HYPERLINKS, HYPERLINKS_AND_IMAGES, or ALL_LINKS.
 */
public String[] getLinkType () {
    return type;
}

/**
 * Set legal link types to crawl.
 * @param type one of HYPERLINKS, HYPERLINKS_AND_IMAGES, or ALL_LINKS.
 */
public void setLinkType (String[] type) {
    this.type = type;
}

/**
 * Get depth-first search flag.  Default value is true.
 * @return true if search is depth-first, false if search is breadth-first.
 */
public boolean getDepthFirst() {
    return depthFirst;
}

/**
 * Set depth-first search flag.  If neither depth-first nor breadth-first
 * is desired, then override shouldVisit() to set a custom priority on
 * each link.
 * @param useDFS true if search should be depth-first, false if search should be breadth-first.
 */
public void setDepthFirst(boolean useDFS) {
    depthFirst = useDFS;
}

/**
 * Get synchronous flag.  Default value is false.
 * @return true if crawler must visit the pages in priority order; false if crawler can visit
 * pages in any order.
 */
public boolean getSynchronous() {
    return synchronous;
}

/**
 * Set synchronous flag.
 * @param f true if crawler must visit the pages in priority order; false if crawler can visit
 * pages in any order.
 */
public void setSynchronous(boolean f) {
    synchronous = f;
}

/**
 * Get ignore-visited-links flag.  Default value is true.
 * @return true if search skips links whose URLs have already been visited
 * (or queued for visiting).
 */
public boolean getIgnoreVisitedLinks() {
    return ignoreVisitedLinks;
}

/**
 * Set ignore-visited-links flag.
 * @param f true if search skips links whose URLs have already been visited
 * (or queued for visiting).
 */
public void setIgnoreVisitedLinks(boolean f) {
    ignoreVisitedLinks = f;
}

/**
 * Get maximum depth.  Default value is 5.
 * @return maximum depth of crawl, in hops from starting point.
 */
public int getMaxDepth() {
    return maxDepth;
}

/**
 * Set maximum depth.
 * @param maxDepth maximum depth of crawl, in hops from starting point
 */
public void setMaxDepth(int maxDepth) {
    this.maxDepth = maxDepth;
}

/**
 * Get download parameters (such as number of threads, timeouts, maximum
 * page size, etc.)
 */
public DownloadParameters getDownloadParameters() {
    return dp;
}

/**
 * Set download parameters (such as number of threads, timeouts, maximum
 * page size, etc.)
 * @param dp Download parameters
 */
public void setDownloadParameters(DownloadParameters dp) {
    this.dp = dp;
}

/**
 * Set link predicate.  This is an alternative way to
 * specify the links to walk.  If the link predicate is
 * non-null, then only links that satisfy
 * the link predicate AND shouldVisit() are crawled.
 * @param pred Link predicate
 */
public void setLinkPredicate (LinkPredicate pred) {
    if (pred == linkPredicate
        || (pred != null && pred.equals (linkPredicate)))
        return;
    if (linkPredicate != null)
        linkPredicate.disconnected (this);
    linkPredicate = pred;
    if (linkPredicate != null)
        linkPredicate.connected (this);
}

/**
 * Get link predicate.
 * @return current link predicate
 */
public LinkPredicate getLinkPredicate () {
    return linkPredicate;
}

/**
 * Set page predicate.  This is a way to filter the pages
 * passed to visit().  If the page predicate is
 * non-null, then only pages that satisfy it are passed to visit().
 * @param pred Page predicate
 */
public void setPagePredicate (PagePredicate pred) {
    if (pred == pagePredicate
        || (pred != null && pred.equals (pagePredicate)))
        return;
    if (pagePredicate != null)
        pagePredicate.disconnected (this);
    pagePredicate = pred;
    if (pagePredicate != null)
        pagePredicate.connected (this);
}

/**
 * Get page predicate.
 * @return current page predicate
 */
public PagePredicate getPagePredicate () {
    return pagePredicate;
}

/**
 * Set the action.  This is an alternative way to specify
 * an action performed on every page.  If act is non-null,
 * then every page passed to visit() is also passed to this
 * action.
 * @param act Action
 */
public void setAction (Action act) {
    if (act == action
        || (act != null && act.equals (action)))
        return;
    if (action != null)
        action.disconnected (this);
    action = act;
    if (action != null)
        action.connected (this);
}

/**
 * Get action.
 * @return current action
 */
public Action getAction () {
    return action;
}

/*
 * Link queue management
 */

/**
 * Puts a link into the crawling queue.  If the crawler is running, the
 * link will eventually be retrieved and passed to visit().
 * @param link Link to put in queue
 */
public void submit (Link link) {
    markVisited (link); // FIX: need atomic test-and-set of visited flag
    synchronized (crawlQueue) {
        synchronized (fetchQueue) {
            crawlQueue.put (link);
            ++numPagesLeft;
            fetchQueue.put (link);
            fetchQueue.notifyAll (); // wake up worms
        }
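
The methods above are the crawler's extension points: override visit() to process each downloaded page and shouldVisit() to filter links, then configure the roots, depth, and search order through the setters. Below is a minimal sketch of such a subclass, assuming only the Crawler/Link/Page API shown in this listing; the SameHostCrawler class name, the example.com root URL, the same-host rule, and the run() call used to start the crawl are illustrative assumptions and are not part of this excerpt.

// A minimal sketch, assuming the Crawler/Link/Page API shown above.
// SameHostCrawler, the example.com root, and run() are illustrative assumptions.
import java.net.MalformedURLException;

public class SameHostCrawler extends Crawler {

    // Process each retrieved page; here we only report its depth.
    public void visit (Page page) {
        System.out.println ("visited a page at depth " + page.getDepth ());
    }

    // Follow only links that stay on the assumed starting host.
    public boolean shouldVisit (Link l) {
        return "example.com".equals (l.getURL ().getHost ());
    }

    public static void main (String[] args) throws MalformedURLException {
        SameHostCrawler crawler = new SameHostCrawler ();
        crawler.setRoot (new Link ("http://example.com/")); // starting point (assumed URL)
        crawler.setMaxDepth (3);              // at most 3 hops from the root
        crawler.setDepthFirst (false);        // breadth-first link priorities in expand()
        crawler.setIgnoreVisitedLinks (true); // skip URLs already queued or visited
        crawler.run ();                       // assumed entry point; not shown in this excerpt
    }
}

With ignoreVisitedLinks left at its default of true, expand() and submit() together aim to queue each URL at most once, subject to the race condition flagged by the FIX comments about an atomic test-and-set of the visited flag.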