// spider.java -- listing from a six-chapter advanced Java tutorial (page 1 of 2).
// Spider - a web-robot class
//
// Copyright (C) 1996 by Jef Poskanzer <jef@acme.com>.  All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
// SUCH DAMAGE.
//
// Visit the ACME Labs Java page for up-to-date versions of this and other
// fine Java utilities: http://www.acme.com/java/

package Acme;

import java.util.*;
import java.net.*;
import java.io.*;

/// A web-robot class.
// <P>
// This is an Enumeration class that traverses the web starting at
// a given URL.  It fetches HTML files and parses them for new
// URLs to look at.  All files it encounters, HTML or otherwise,
// are returned by the nextElement() method as a URLConnection.
// <P>
// The traversal is breadth-first, and by default it is limited to
// files at or below the starting point - same protocol, hostname, and
// initial directory.
// <P>
// Because of the security restrictions on applets, this is currently
// only useful from applications.
// <P>
// Sample code:
// <BLOCKQUOTE><PRE>
// Enumeration spider = new Acme.Spider( "http://some.site.com/whatever/" );
// while ( spider.hasMoreElements() )
//     {
//     URLConnection conn = (URLConnection) spider.nextElement();
//     // Then do whatever you like with conn:
//     URL thisUrl = conn.getURL();
//     String thisUrlStr = thisUrl.toExternalForm();
//     String mimeType = conn.getContentType();
//     long changed = conn.getLastModified();
//     InputStream s = conn.getInputStream();
//     // Etc. etc. etc., your code here.
//     }
// </PRE></BLOCKQUOTE>
// There are also a couple of methods you can override via a subclass, to
// control things like the search limits and what gets done with broken links.
// <P>
// Sample applications that use Acme.Spider:
// <UL>
// <LI> <A HREF="WebList.html">WebList</A> - make a list of the files in a web subtree
// <LI> <A HREF="WebCopy.html">WebCopy</A> - copy a remote web subtree to the local disk
// <LI> <A HREF="WebGrep.html">WebGrep</A> - grep a web subtree for a pattern
// </UL>
// <P>
// <A HREF="/resources/classes/Acme/Spider.java">Fetch the software.</A><BR>
// <A HREF="/resources/classes/Acme.tar.gz">Fetch the entire Acme package.</A>
// <P>
// @see Acme.HtmlScanner
// @see Acme.NoRobots

public class Spider implements Acme.HtmlObserver, Enumeration
    {

    // Used only in the default error-reporting routines.  If you override
    // them, this is not used at all.
    /*private*/ protected PrintStream err;

    // The spider's state.
    // todo: SpiderItems waiting to be looked at; addUrl() appends at the
    // back and getOne() reads from the front.
    /*private*/ protected Queue todo = new Queue();
    // Cap on the todo queue as passed to the size-limited constructors;
    // stays 0 otherwise.  Where it is enforced is not visible in this
    // part of the file.
    private int todoLimit = 0;
    // URL strings already taken off the queue, stored as both key and
    // value.  A plain Hashtable, or an Acme.LruHashtable when a non-zero
    // doneLimit was given.
    /*private*/ protected Hashtable done;

    // Temporary state used only between when hasMoreElements and
    // nextElement are called.  Multi-threaded callers should be careful.
    // This seems to be an inherent problem with using Enumeration in
    // a multi-threaded context.
    // getOne() fills these in; nextElement() copies them into locals.
    private SpiderItem item;
    private URL thisUrl;

    // The optional authorization cookie.
    // When non-null, nextElement() sends it base64-encoded in a
    // "Basic" Authorization request header.
    private String auth_cookie = null;

    // The list of Acme.HtmlObservers to add to the HtmlScanner.
    private Vector observers = new Vector();

    // The user-agent name of this spider.
    // Sent as the User-Agent request header and handed to the NoRobots
    // checker below.
    private final String spiderName = "Acme.Spider";

    // The Robot Exclusion checker.
    private Acme.NoRobots noRobots = new Acme.NoRobots( spiderName );


    /// Constructor with no size limits.
    // @param err the error stream
    public Spider( PrintStream err )
	{
	// Hand off to the size-limited constructor with both limits zero:
	// todoLimit keeps its default of 0 and a doneLimit of 0 selects the
	// plain Hashtable, which is exactly the state this constructor
	// used to set up by hand.
	this( 0, 0, err );
	}

    /// Constructor with no size limits, and the default error stream.
    public Spider()
	{
	// Same as Spider( PrintStream ) with System.err as the error stream.
	this( System.err );
	}

    /// Constructor with a single URL and no size limits.
    // @param urlStr the URL to start off the enumeration
    // @param err the error stream
    public Spider( String urlStr, PrintStream err ) throws MalformedURLException
	{
	this( err );
	// Queue the starting URL.  Any MalformedURLException from addUrl()
	// propagates to the caller.
	// NOTE(review): addUrl() is public and not final, so a subclass
	// override would run here before the subclass constructor body.
	addUrl( urlStr );
	}

    /// Constructor with a single URL and no size limits, and the default
    // error stream.
    // @param urlStr the URL to start off the enumeration
    public Spider( String urlStr ) throws MalformedURLException
	{
	// Same as Spider( String, PrintStream ) with System.err as the
	// error stream.
	this( urlStr, System.err );
	}

    /// Constructor with size limits.
    // This version lets you specify limits on the todo queue and the
    // done hash-table.  If you are using Spider for a large, multi-site
    // traversal, then you may need to set these limits to avoid running
    // out of memory.  Note that setting a todoLimit means the traversal
    // will not be complete - you may skip some URLs.  And setting the
    // doneLimit means it may re-visit some pages.
    // <P>
    // Guesses at good values for an unlimited traversal: 200000 and 20000.
    // You want the doneLimit pretty small because the hash-table gets checked
    // for every URL, so it will be mostly in memory; the todo queue, on the
    // other hand, is only accessed at the front and back, and so will be
    // mostly paged out.
    // This constructor takes no starting URL; supply one or more with addUrl().
    // @param todoLimit maximum number of URLs to queue for examination
    // @param doneLimit maximum number of URLs to remember having done already
    // @param err the error stream
    public Spider( int todoLimit, int doneLimit, PrintStream err )
	{
	// A doneLimit of zero selects a plain Hashtable; any other value
	// is passed to Acme.LruHashtable as its size.
	done = ( doneLimit == 0 )
	    ? new Hashtable()
	    : new Acme.LruHashtable( doneLimit );
	this.todoLimit = todoLimit;
	this.err = err;
	}

    /// Constructor with size limits.
    // Uses the default error stream.  Add starting URLs with addUrl().
    // @param todoLimit maximum number of URLs to queue for examination
    // @param doneLimit maximum number of URLs to remember having done already
    public Spider( int todoLimit, int doneLimit )
	{
	// Same as the three-argument form with System.err as the error stream.
	this( todoLimit, doneLimit, System.err );
	}


    /// Add a URL to the to-do list.
    public synchronized void addUrl( String urlStr ) throws MalformedURLException
	{
	// Run the caller's string through Acme.Utils.plainUrl() and work
	// with the resulting URL's external form from here on.
	String startStr = Acme.Utils.plainUrl( urlStr ).toExternalForm();

	// The base string derived from the start URL is what doThisUrl()
	// later receives as baseUrlStr.  The null and 0 are presumably
	// "no referring URL" and "depth zero" - confirm against SpiderItem.
	SpiderItem seed =
	    new SpiderItem( startStr, null, 0, Acme.Utils.baseUrlStr( startStr ) );
	todo.addBack( seed );
	}

    /// Set the authorization cookie.
    // <P>
    // Syntax is userid:password.
    public synchronized void setAuth( String auth_cookie )
	{
	// Stored exactly as given.  nextElement() base64-encodes it into a
	// "Basic" Authorization header whenever it is non-null, so passing
	// null turns the header off again.
	this.auth_cookie = auth_cookie;
	}

    /// Add an extra observer to the scanners we make.  Multiple observers
    // get called in the order they were added.
    // <P>
    // Alternatively, if you want to add a different observer to each
    // scanner, you can cast the input stream to a scanner and call
    // its add routine, like so:
    // <BLOCKQUOTE><CODE><PRE>
    // InputStream s = conn.getInputStream();
    // Acme.HtmlScanner scanner = (Acme.HtmlScanner) s;
    // scanner.addObserver( this );
    // </PRE></CODE></BLOCKQUOTE>
    public synchronized void addObserver( Acme.HtmlObserver observer )
	{
	// Appended at the end of the Vector; nextElement() walks the Vector
	// front to back when attaching observers to a new HtmlScanner.
	observers.addElement( observer );
	}


    /// This method can be overridden by a subclass if you want to change
    // the search policy.  The default version only does URLs that start
    // with the same string as the base URL.  An alternate version might
    // instead go by the search depth.
    protected boolean doThisUrl( String thisUrlStr, int depth, String baseUrlStr )
	{
	// Default policy: follow a URL only if it begins with the base URL
	// string.  The depth argument is not consulted here; it is available
	// to overriding subclasses.
	return thisUrlStr.startsWith( baseUrlStr );
	}

    /// This method can be overridden by a subclass if you want to change
    // the broken link policy.  The default version reports the broken
    // link on the error stream.  An alternate version might attempt to
    // send mail to the owner of the page with the broken link.
    protected void brokenLink( String fromUrlStr, String toUrlStr, String errmsg )
	{
	// Three lines on the error stream: the page holding the link, the
	// link's target, and the reason it could not be used.
	PrintStream out = err;
	out.println( "Broken link in " + fromUrlStr );
	out.println( "    pointing to " + toUrlStr );
	out.println( "    " + errmsg );
	}

    /// This method can be overridden by a subclass if you want to change
    // the error reporting policy.  The default version reports the error
    // on the error stream.  An alternate version might ignore the error.
    protected void reportError( String fromUrlStr, String toUrlStr, String errmsg )
	{
	// Same three-line layout as brokenLink(), under an "Error in" heading.
	PrintStream out = err;
	out.println( "Error in " + fromUrlStr );
	out.println( "    pointing to " + toUrlStr );
	out.println( "    " + errmsg );
	}

    
    // True while getOne() has staged an entry in item/thisUrl that
    // nextElement() has not yet consumed.
    private boolean gotOne = false;

    // Get the next file, if possible.
    private synchronized void getOne()
	{
	// Walk the to-do queue until an entry survives every check.  On
	// success item/thisUrl describe it and gotOne is true; if the queue
	// runs dry first, gotOne ends up false.
	// NOTE(review): getFront() presumably removes the entry it returns,
	// otherwise this loop could not terminate - confirm in Acme.Queue.
	while ( ! todo.isEmpty() )
	    {
	    item = (SpiderItem) todo.getFront();
	    String candidate = item.thisUrlStr;

	    // Skip anything already handled.
	    if ( done.containsKey( candidate ) )
		continue;

	    // Record it as done before trying it, so a URL that fails below
	    // is not attempted a second time.
	    done.put( candidate, candidate );
	    try
		{
		thisUrl = new URL( candidate );
		if ( noRobots.ok( thisUrl ) )
		    {
		    gotOne = true;
		    return;
		    }
		// Excluded by the robots check: fall through to the next entry.
		}
	    catch ( MalformedURLException e )
		{
		// Only report malformed URLs that checkMalformedURL() does
		// not classify as ignorable.
		String msg = e.getMessage();
		if ( checkMalformedURL( msg ) )
		    brokenLink( myUrlToString( item.fromUrl ), candidate, msg );
		}
	    catch ( Exception e )
		{
		reportError(
		    myUrlToString( item.fromUrl ), candidate, e.toString() );
		}
	    }

	gotOne = false;
	}
    
    // Decides whether a MalformedURLException message should be reported
    // as a broken link.
    // @param msg the exception's message; may be null
    // @return false when the message is an "unknown protocol: " complaint
    //         about one of the schemes listed below, true otherwise
    private static boolean checkMalformedURL( String msg )
	{
	// Throwable.getMessage() is allowed to return null.  With nothing to
	// inspect, report the link instead of dying with a
	// NullPointerException inside the caller's catch clause.
	if ( msg == null )
	    return true;

	// Java is missing protocol handlers for many common
	// protocols.  Ignore those errors.
	if ( ! msg.startsWith( "unknown protocol: " ) )
	    return true;

	// The scheme name is whatever follows the last blank in the message.
	String protocol = msg.substring( msg.lastIndexOf( ' ' ) + 1 );
	String[] ignorable =
	    { "gopher", "ftp", "file", "telnet", "news", "mailto", "javascript" };
	for ( int i = 0; i < ignorable.length; ++i )
	    if ( protocol.equalsIgnoreCase( ignorable[i] ) )
		return false;
	return true;
	}

    /// Standard Enumeration method.
    public synchronized boolean hasMoreElements()
	{
	// An entry staged by an earlier call is still waiting to be consumed.
	if ( gotOne )
	    return true;

	// Otherwise try to stage one now; getOne() leaves gotOne telling us
	// whether it succeeded.
	getOne();
	return gotOne;
	}


    /// Standard Enumeration method.
    public synchronized Object nextElement()
	{
	if ( ! gotOne )
	    getOne();
	if ( ! gotOne )
	    return null;
	gotOne = false;

	// Make local copies of the temporary global state variables, so the
	// window for overwriting them is smaller.
	SpiderItem localItem = item;
	URL localThisUrl = thisUrl;

	try
	    {
	    URLConnection uc = localThisUrl.openConnection();
	    uc.setRequestProperty( "User-Agent", spiderName );
	    if ( auth_cookie != null )
		uc.setRequestProperty( "Authorization", "Basic " + Acme.Utils.base64Encode( auth_cookie ) );
	    uc.connect();
	    InputStream s = uc.getInputStream();
	    String contentType = uc.getContentType();
	    if ( contentType != null && contentType.startsWith( "text/html" ) )
		{
		// Make a scanner, and pass in the SpiderItem as the clientData.
		HtmlScanner scanner = new HtmlScanner(
		    s, localThisUrl, this, localItem );

		// Add any extra observers.
		Enumeration en = observers.elements();
		while ( en.hasMoreElements() )
		    {
		    Acme.HtmlObserver observer =
			(Acme.HtmlObserver) en.nextElement();
		    scanner.addObserver( observer );
		    }

		// And make a URLConnection that uses this scanner.
		SpiderConnection asc = new SpiderConnection( uc, scanner );
		return asc;
		}
	    else
		{
		// [Page 1 of 2 of the listing ends here; the rest of this method is on the next page.]