⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 spider.java

📁 java高级使用教程 全书一共分六章
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
		// If it's not HTML we don't have to interpose our parser.
		return uc;
		}
	    }
	catch ( FileNotFoundException e )
	    {
	    brokenLink(
		myUrlToString( localItem.fromUrl ), localItem.thisUrlStr,
		e.getMessage() );
	    }
	catch ( UnknownHostException e )
	    {
	    brokenLink(
		myUrlToString( localItem.fromUrl ), localItem.thisUrlStr,
		"unknown host -- " + e.getMessage() );
	    }
	catch ( Exception e )
	    {
	    reportError(
		myUrlToString( localItem.fromUrl ), localItem.thisUrlStr,
		e.toString() );
	    }
	return null;
	}
    

    /// Renders a URL for use in messages.
    // A null URL yields the fixed text "an initial URL"; any other URL
    // yields its external form.
    private String myUrlToString( URL url )
	{
	return ( url == null ) ? "an initial URL" : url.toExternalForm();
	}


    /// Acme.HtmlObserver callback.
    // Hands the reported URL to add().  clientData is cast to SpiderItem
    // unconditionally, so it must be one.
    public void gotAHREF( String urlStr, URL contextUrl, Object clientData )
	{
	SpiderItem fromItem = (SpiderItem) clientData;
	add( urlStr, contextUrl, fromItem );
	}

    /// Acme.HtmlObserver callback.
    // Hands the reported URL to add().  clientData is cast to SpiderItem
    // unconditionally, so it must be one.
    public void gotIMGSRC( String urlStr, URL contextUrl, Object clientData )
	{
	SpiderItem fromItem = (SpiderItem) clientData;
	add( urlStr, contextUrl, fromItem );
	}

    /// Acme.HtmlObserver callback.
    // Hands the reported URL to add().  clientData is cast to SpiderItem
    // unconditionally, so it must be one.
    public void gotFRAMESRC( String urlStr, URL contextUrl, Object clientData )
	{
	SpiderItem fromItem = (SpiderItem) clientData;
	add( urlStr, contextUrl, fromItem );
	}

    /// Acme.HtmlObserver callback.
    // Intentionally a no-op: all three arguments are ignored and nothing
    // is queued.
    public void gotBASEHREF( String urlStr, URL contextUrl, Object clientData )
	{
	// Nothing.
	}

    /// Acme.HtmlObserver callback.
    // Hands the reported URL to add().  clientData is cast to SpiderItem
    // unconditionally, so it must be one.
    public void gotAREAHREF( String urlStr, URL contextUrl, Object clientData )
	{
	SpiderItem fromItem = (SpiderItem) clientData;
	add( urlStr, contextUrl, fromItem );
	}

    /// Acme.HtmlObserver callback.
    // Hands the reported URL to add().  clientData is cast to SpiderItem
    // unconditionally, so it must be one.
    public void gotLINKHREF( String urlStr, URL contextUrl, Object clientData )
	{
	SpiderItem fromItem = (SpiderItem) clientData;
	add( urlStr, contextUrl, fromItem );
	}

    /// Acme.HtmlObserver callback.
    // Hands the reported URL to add().  clientData is cast to SpiderItem
    // unconditionally, so it must be one.
    public void gotBODYBACKGROUND( String urlStr, URL contextUrl, Object clientData )
	{
	SpiderItem fromItem = (SpiderItem) clientData;
	add( urlStr, contextUrl, fromItem );
	}

    /// Queues a link, plus each of its parent directories up to the root.
    // urlStr is resolved against contextUrl by Acme.Utils.plainUrl() and
    // the external form of the result is what gets offered to addOne().
    // addOne() makes the actual decision on whether a URL is queued.
    // A MalformedURLException is reported through brokenLink() only when
    // checkMalformedURL() returns true for its message; otherwise it is
    // dropped.
    private void add( String urlStr, URL contextUrl, SpiderItem item )
	{
	try
	    {
	    // Convert to no-ref, canonical form.
	    URL url = Acme.Utils.plainUrl( contextUrl, urlStr );
	    urlStr = url.toExternalForm();
	    // Add it.
	    addOne( urlStr, contextUrl, item );
	    // Also add all parent directories up to the root.  We'll get
	    // a lot of duplicates this way, but the hashtable will filter
	    // them out.
	    String rootUrlStr =
		( new URL( new URL( urlStr ), "/" ) ).toExternalForm();
	    int rootLen = rootUrlStr.length();
	    while ( urlStr.length() > rootLen )
		{
		// Search from length-2 so a trailing slash is skipped.
		int lastSlash = urlStr.lastIndexOf( '/', urlStr.length() - 2 );
		// Stop if the cut would land before the end of the root.
		// This happens when there is no slash past the root at all,
		// e.g. an opaque URL like "mailto:user@host", for which
		// lastIndexOf returns -1 and the unguarded code would go on
		// to offer the empty string to addOne().
		if ( lastSlash + 1 < rootLen )
		    break;
		urlStr = urlStr.substring( 0, lastSlash + 1 );
		addOne( urlStr, contextUrl, item );
		}
	    }
	catch ( MalformedURLException e )
	    {
	    String msg = e.getMessage();
	    if ( checkMalformedURL( msg ) )
		brokenLink( myUrlToString( contextUrl ), urlStr, msg );
	    }
	}
    
    /// Offers a single URL to the to-do queue.
    // The URL is appended to todo only if all three checks pass, in this
    // order: it is not a key of done, doThisUrl() accepts it at one level
    // deeper than item, and the queue is below todoLimit (a todoLimit of
    // zero disables that last check).
    private void addOne( String urlStr, URL contextUrl, SpiderItem item )
	{
	// Already done?  Then there is nothing to add.
	if ( done.containsKey( urlStr ) )
	    return;

	// Ask whether this one should be visited at all.
	int nextDepth = item.depth + 1;
	if ( ! doThisUrl( urlStr, nextDepth, item.baseUrlStr ) )
	    return;

	// Respect the queue size limit, if one is set.
	boolean hasRoom = ( todoLimit == 0 || todo.size() < todoLimit );
	if ( hasRoom )
	    {
	    SpiderItem queued = new SpiderItem(
		urlStr, contextUrl, nextDepth, item.baseUrlStr );
	    todo.addBack( queued );
	    }
	}


    /// Test program.  Shows URLs, file sizes, etc. at the ACME Java site.
    // Expects exactly one argument, the URL to start from; otherwise a
    // usage line is written to System.err and the program returns.
    // For every URLConnection the Spider enumerates, one line is written
    // to System.out: the URL, its content type, and the number of bytes
    // read from its input stream.  If reading fails part-way, the error
    // is reported on System.err and the count reached so far is printed.
    public static void main( String[] args )
	{
	if ( args.length != 1 )
	    {
	    System.err.println( "usage: Spider URL" );
	    return;
	    }
	Enumeration as;
	try
	    {
	    as = new Spider( args[0], System.err );
	    }
	catch ( MalformedURLException e )
	    {
	    System.err.println( e );
	    return;
	    }

	while ( as.hasMoreElements() )
	    {
	    URLConnection uc = (URLConnection) as.nextElement();
	    URL thisUrl = uc.getURL();
	    String thisUrlStr = thisUrl.toExternalForm();
	    String mimeType = uc.getContentType();
	    int bytes = 0;
	    InputStream s = null;
	    try
		{
		s = uc.getInputStream();
		while ( s.read() != -1 )
		    ++bytes;
		}
	    catch ( IOException e )
		{
		// Keep going with the next URL, but say what went wrong
		// instead of silently printing a short count.
		System.err.println( thisUrlStr + ": " + e );
		}
	    finally
		{
		// Always release the stream, even when a read failed.
		if ( s != null )
		    {
		    try
			{
			s.close();
			}
		    catch ( IOException ignored )
			{
			// Nothing useful can be done about a failed close.
			}
		    }
		}
	    System.out.println( thisUrlStr + " " + mimeType + " " + bytes );
	    }
	}

    }


// A struct class to hold info on a queued URL.
//
// Pure data holder: the constructor copies its four arguments into the
// like-named fields and does nothing else.  Fields are package-visible
// and are read directly by the code that uses them.
class SpiderItem
    {
    // The queued URL itself, as a string.
    String thisUrlStr;

    // The URL recorded alongside this item as where it came from.
    // May be null.
    URL fromUrl;

    // Depth value associated with this item.
    int depth;

    // Base URL string associated with this item.
    String baseUrlStr;

    /// Constructor.  Stores each argument in the field of the same name.
    public SpiderItem(
	String thisUrlStr, URL fromUrl, int depth, String baseUrlStr )
	{
	this.baseUrlStr = baseUrlStr;
	this.depth = depth;
	this.fromUrl = fromUrl;
	this.thisUrlStr = thisUrlStr;
	}
    }


// SpiderConnection - utility class for Spider
//
// A SpiderConnection is the type returned by Spider.  It's a
// URLConnection, slightly modified internally to work with Spider.
//
// It wraps another URLConnection.  getInputStream() returns the stream
// held in the field s; every other method below simply forwards to the
// wrapped connection (or, for the static ones, to URLConnection).

class SpiderConnection extends URLConnection
    {
    // The connection all forwarded calls go to.
    private URLConnection wrapped;
    // The stream handed out by getInputStream().
    protected InputStream s;

    /// Constructor.  Uses the wrapped connection's own input stream.
    // @exception IOException if uc.getInputStream() fails
    public SpiderConnection( URLConnection uc ) throws IOException
	{
	super( uc.getURL() );
	this.s = uc.getInputStream();
	this.wrapped = uc;
	}

    /// Constructor.  Uses the supplied stream in place of uc's own.
    public SpiderConnection( URLConnection uc, InputStream s )
	{
	super( uc.getURL() );
	this.s = s;
	this.wrapped = uc;
	}

    /// Returns the stream stored at construction time.
    public InputStream getInputStream() throws IOException
	{
	return s;
	}

    // Everything from here on is plain delegation.

    // Connection and identity.
    public final void connect() throws IOException
	{
	wrapped.connect();
	}
    public final URL getURL()
	{
	return wrapped.getURL();
	}

    // Standard response headers.
    public final int getContentLength()
	{
	return wrapped.getContentLength();
	}
    public final String getContentType()
	{
	return wrapped.getContentType();
	}
    public final String getContentEncoding()
	{
	return wrapped.getContentEncoding();
	}
    public final long getExpiration()
	{
	return wrapped.getExpiration();
	}
    public final long getDate()
	{
	return wrapped.getDate();
	}
    public final long getLastModified()
	{
	return wrapped.getLastModified();
	}

    // Generic header access.
    public final String getHeaderField( String name )
	{
	return wrapped.getHeaderField( name );
	}
    public final int getHeaderFieldInt( String name, int Default )
	{
	return wrapped.getHeaderFieldInt( name, Default );
	}
    public final long getHeaderFieldDate( String name, long Default )
	{
	return wrapped.getHeaderFieldDate( name, Default );
	}
    public final String getHeaderFieldKey( int n )
	{
	return wrapped.getHeaderFieldKey( n );
	}
    public final String getHeaderField( int n )
	{
	return wrapped.getHeaderField( n );
	}

    // Content and output.
    public final Object getContent() throws IOException
	{
	return wrapped.getContent();
	}
    public final OutputStream getOutputStream() throws IOException
	{
	return wrapped.getOutputStream();
	}
    public final String toString()
	{
	return wrapped.toString();
	}

    // Per-connection settings.
    public final void setDoInput( boolean doinput )
	{
	wrapped.setDoInput( doinput );
	}
    public final boolean getDoInput()
	{
	return wrapped.getDoInput();
	}
    public final void setDoOutput( boolean dooutput )
	{
	wrapped.setDoOutput( dooutput );
	}
    public final boolean getDoOutput()
	{
	return wrapped.getDoOutput();
	}
    public final void setAllowUserInteraction( boolean allowuserinteraction )
	{
	wrapped.setAllowUserInteraction( allowuserinteraction );
	}
    public final boolean getAllowUserInteraction()
	{
	return wrapped.getAllowUserInteraction();
	}

    // Class-wide default, forwarded to URLConnection itself.
    public static final void setDefaultAllowUserInteraction(
	boolean defaultallowuserinteraction )
	{
	URLConnection.setDefaultAllowUserInteraction(
	    defaultallowuserinteraction );
	}
    public static final boolean getDefaultAllowUserInteraction()
	{
	return URLConnection.getDefaultAllowUserInteraction();
	}

    // Caching controls.
    public final void setUseCaches( boolean usecaches )
	{
	wrapped.setUseCaches( usecaches );
	}
    public final boolean getUseCaches()
	{
	return wrapped.getUseCaches();
	}
    public final void setIfModifiedSince( long ifmodifiedsince )
	{
	wrapped.setIfModifiedSince( ifmodifiedsince );
	}
    public final long getIfModifiedSince()
	{
	return wrapped.getIfModifiedSince();
	}
    public final boolean getDefaultUseCaches()
	{
	return wrapped.getDefaultUseCaches();
	}
    public final void setDefaultUseCaches( boolean defaultusecaches )
	{
	wrapped.setDefaultUseCaches( defaultusecaches );
	}

    // Request properties.
    public final void setRequestProperty( String key, String value )
	{
	wrapped.setRequestProperty( key, value );
	}
    public final String getRequestProperty( String key )
	{
	return wrapped.getRequestProperty( key );
	}

    // Class-wide request defaults and handler factory, forwarded to
    // URLConnection itself.
    public static final void setDefaultRequestProperty(
	String key, String value )
	{
	URLConnection.setDefaultRequestProperty( key, value );
	}
    public static final String getDefaultRequestProperty( String key )
	{
	return URLConnection.getDefaultRequestProperty( key );
	}
    public static final void setContentHandlerFactory(
	ContentHandlerFactory fac )
	{
	URLConnection.setContentHandlerFactory( fac );
	}
    }

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -