📄 spider.java
字号:
// If it's not HTML we don't have to interpose our parser.
return uc;
}
}
catch ( FileNotFoundException e )
{
brokenLink(
myUrlToString( localItem.fromUrl ), localItem.thisUrlStr,
e.getMessage() );
}
catch ( UnknownHostException e )
{
brokenLink(
myUrlToString( localItem.fromUrl ), localItem.thisUrlStr,
"unknown host -- " + e.getMessage() );
}
catch ( Exception e )
{
reportError(
myUrlToString( localItem.fromUrl ), localItem.thisUrlStr,
e.toString() );
}
return null;
}
// Returns the external form of url, or the placeholder text
// "an initial URL" when url is null.  The result is only used to build
// the "from" part of brokenLink()/reportError() messages.
private String myUrlToString( URL url )
    {
    return ( url != null ) ? url.toExternalForm() : "an initial URL";
    }
/// Acme.HtmlObserver callback.
// Hands the reported URL to add() so that it can be queued.  clientData
// is cast to the SpiderItem that add() expects.
public void gotAHREF( String urlStr, URL contextUrl, Object clientData )
    {
    SpiderItem sourceItem = (SpiderItem) clientData;
    add( urlStr, contextUrl, sourceItem );
    }
/// Acme.HtmlObserver callback.
// Hands the reported URL to add() so that it can be queued.  clientData
// is cast to the SpiderItem that add() expects.
public void gotIMGSRC( String urlStr, URL contextUrl, Object clientData )
    {
    SpiderItem sourceItem = (SpiderItem) clientData;
    add( urlStr, contextUrl, sourceItem );
    }
/// Acme.HtmlObserver callback.
// Hands the reported URL to add() so that it can be queued.  clientData
// is cast to the SpiderItem that add() expects.
public void gotFRAMESRC( String urlStr, URL contextUrl, Object clientData )
    {
    SpiderItem sourceItem = (SpiderItem) clientData;
    add( urlStr, contextUrl, sourceItem );
    }
/// Acme.HtmlObserver callback.
// Deliberately a no-op: unlike the other callbacks in this class, the
// reported URL is not handed to add(), so nothing is queued for it.
public void gotBASEHREF( String urlStr, URL contextUrl, Object clientData )
{
// Nothing.
}
/// Acme.HtmlObserver callback.
// Hands the reported URL to add() so that it can be queued.  clientData
// is cast to the SpiderItem that add() expects.
public void gotAREAHREF( String urlStr, URL contextUrl, Object clientData )
    {
    SpiderItem sourceItem = (SpiderItem) clientData;
    add( urlStr, contextUrl, sourceItem );
    }
/// Acme.HtmlObserver callback.
// Hands the reported URL to add() so that it can be queued.  clientData
// is cast to the SpiderItem that add() expects.
public void gotLINKHREF( String urlStr, URL contextUrl, Object clientData )
    {
    SpiderItem sourceItem = (SpiderItem) clientData;
    add( urlStr, contextUrl, sourceItem );
    }
/// Acme.HtmlObserver callback.
// Hands the reported URL to add() so that it can be queued.  clientData
// is cast to the SpiderItem that add() expects.
public void gotBODYBACKGROUND( String urlStr, URL contextUrl, Object clientData )
    {
    SpiderItem sourceItem = (SpiderItem) clientData;
    add( urlStr, contextUrl, sourceItem );
    }
// Offers a link, and every directory that encloses it, to addOne().
//
// urlStr is resolved against contextUrl by Acme.Utils.plainUrl(); the
// resulting URL string and each of its parent directories up to the
// server root are then passed to addOne() together with contextUrl and
// item.  A MalformedURLException raised while resolving is reported via
// brokenLink() when checkMalformedURL() accepts its message, and is
// otherwise dropped.
private void add( String urlStr, URL contextUrl, SpiderItem item )
    {
    try
        {
        // Convert to no-ref, canonical form.
        URL url = Acme.Utils.plainUrl( contextUrl, urlStr );
        urlStr = url.toExternalForm();

        // Add it.
        addOne( urlStr, contextUrl, item );

        // Also add all parent directories up to the root.  We'll get
        // a lot of duplicates this way, but the hashtable will filter
        // them out.
        String rootUrlStr =
            ( new URL( new URL( urlStr ), "/" ) ).toExternalForm();
        while ( urlStr.length() > rootUrlStr.length() )
            {
            // Skip the final character so that a trailing '/' is not
            // found again; each pass therefore strictly shortens urlStr.
            int lastSlash = urlStr.lastIndexOf( '/', urlStr.length() - 2 );
            String parentUrlStr = urlStr.substring( 0, lastSlash + 1 );

            // A URL with no '/' after its scheme (lastIndexOf gives -1),
            // or one whose only later '/' sits in a query on a path-less
            // host, would otherwise be cut down to "" or to something
            // like "http://".  Those are not directories under the root,
            // so stop instead of queueing them.
            if ( ! parentUrlStr.startsWith( rootUrlStr ) )
                break;

            urlStr = parentUrlStr;
            addOne( urlStr, contextUrl, item );
            }
        }
    catch ( MalformedURLException e )
        {
        String msg = e.getMessage();
        if ( checkMalformedURL( msg ) )
            brokenLink( myUrlToString( contextUrl ), urlStr, msg );
        }
    }
// Appends one URL to the back of the todo queue, unless it is filtered out.
//
// The URL is dropped when it is already a key in done, when doThisUrl()
// rejects it, or when the queue has reached todoLimit (a todoLimit of 0
// disables that check).  The new SpiderItem is one level deeper than
// item and keeps item's baseUrlStr.
private void addOne( String urlStr, URL contextUrl, SpiderItem item )
    {
    // Already handled - nothing to do.
    if ( done.containsKey( urlStr ) )
        return;

    // Not a URL we want to visit.
    int childDepth = item.depth + 1;
    if ( ! doThisUrl( urlStr, childDepth, item.baseUrlStr ) )
        return;

    // Queue is full.
    if ( todoLimit != 0 && todo.size() >= todoLimit )
        return;

    SpiderItem child =
        new SpiderItem( urlStr, contextUrl, childDepth, item.baseUrlStr );
    todo.addBack( child );
    }
/// Test program.  Shows URLs, MIME types and sizes found from a start URL.
// Takes exactly one argument, the URL to start from.  For every item
// the Spider enumeration yields, the item's stream is read to the end
// and one line "URL MIME-type byte-count" is written to System.out.
// Usage and read errors go to System.err.
public static void main( String[] args )
    {
    if ( args.length != 1 )
        {
        System.err.println( "usage: Spider URL" );
        return;
        }
    Enumeration as;
    try
        {
        // NOTE(review): System.err is presumably where the Spider writes
        // its own diagnostics - confirm against the constructor.
        as = new Spider( args[0], System.err );
        }
    catch ( MalformedURLException e )
        {
        System.err.println( e );
        return;
        }
    while ( as.hasMoreElements() )
        {
        URLConnection uc = (URLConnection) as.nextElement();
        URL thisUrl = uc.getURL();
        String thisUrlStr = thisUrl.toExternalForm();
        String mimeType = uc.getContentType();
        // long rather than int, so a very large item cannot wrap the count.
        long bytes = 0;
        InputStream s = null;
        try
            {
            s = uc.getInputStream();
            // Drain the stream with single-byte reads, exactly as handed
            // out by the connection, counting as we go.
            while ( s.read() != -1 )
                ++bytes;
            }
        catch ( IOException e )
            {
            // Say what went wrong instead of hiding it; the partial
            // count is still printed below.
            System.err.println( thisUrlStr + ": " + e );
            }
        finally
            {
            // Release the stream even when the read loop failed.
            if ( s != null )
                {
                try
                    {
                    s.close();
                    }
                catch ( IOException ignored )
                    {
                    // Nothing useful can be done about a failed close.
                    }
                }
            }
        System.out.println( thisUrlStr + " " + mimeType + " " + bytes );
        }
    }
}
// SpiderItem - a bare data holder describing one queued URL.
//
// The fields are package-visible and are read directly by Spider; the
// class has no behaviour beyond its constructor.
class SpiderItem
    {
    // The URL to visit, as a string.
    String thisUrlStr;

    // The URL this item was found from.  May be null; Spider then
    // describes the origin as "an initial URL" in its messages.
    URL fromUrl;

    // Link depth; an item queued from another gets that item's depth + 1.
    int depth;

    // Base URL string, copied unchanged from the item this one was
    // queued from and passed along to Spider.doThisUrl().
    String baseUrlStr;

    // Stores the four values as given; nothing is validated or copied.
    public SpiderItem(
        String thisUrlStr, URL fromUrl, int depth, String baseUrlStr )
        {
        this.baseUrlStr = baseUrlStr;
        this.depth = depth;
        this.fromUrl = fromUrl;
        this.thisUrlStr = thisUrlStr;
        }
    }
// SpiderConnection - utility class for Spider
//
// The connection type that Spider hands out.  It wraps another
// URLConnection: getInputStream() returns the stream held in s, and
// every other method defined here simply forwards to the wrapped
// connection (or, for the static ones, to URLConnection itself).
class SpiderConnection extends URLConnection
    {

    // The wrapped connection that all forwarding methods delegate to.
    private URLConnection uc;

    // The stream returned by getInputStream().
    protected InputStream s;

    // Wraps uc and serves uc's own input stream.  The stream is fetched
    // here, so an IOException from uc.getInputStream() propagates.
    public SpiderConnection( URLConnection uc ) throws IOException
        {
        super( uc.getURL() );
        this.uc = uc;
        this.s = uc.getInputStream();
        }

    // Wraps uc but serves the caller-supplied stream s instead.
    public SpiderConnection( URLConnection uc, InputStream s )
        {
        super( uc.getURL() );
        this.uc = uc;
        this.s = s;
        }

    // The one method that does not forward: returns the stored stream.
    public InputStream getInputStream() throws IOException
        {
        return s;
        }

    // ---- Connection and response accessors, forwarded to uc. ----

    public final void connect() throws IOException
        {
        uc.connect();
        }

    public final URL getURL()
        {
        return uc.getURL();
        }

    public final int getContentLength()
        {
        return uc.getContentLength();
        }

    public final String getContentType()
        {
        return uc.getContentType();
        }

    public final String getContentEncoding()
        {
        return uc.getContentEncoding();
        }

    public final long getExpiration()
        {
        return uc.getExpiration();
        }

    public final long getDate()
        {
        return uc.getDate();
        }

    public final long getLastModified()
        {
        return uc.getLastModified();
        }

    // ---- Header fields, forwarded to uc. ----

    public final String getHeaderField( String name )
        {
        return uc.getHeaderField( name );
        }

    public final int getHeaderFieldInt( String name, int Default )
        {
        return uc.getHeaderFieldInt( name, Default );
        }

    public final long getHeaderFieldDate( String name, long Default )
        {
        return uc.getHeaderFieldDate( name, Default );
        }

    public final String getHeaderFieldKey( int n )
        {
        return uc.getHeaderFieldKey( n );
        }

    public final String getHeaderField( int n )
        {
        return uc.getHeaderField( n );
        }

    // ---- Content, output and description, forwarded to uc. ----

    public final Object getContent() throws IOException
        {
        return uc.getContent();
        }

    public final OutputStream getOutputStream() throws IOException
        {
        return uc.getOutputStream();
        }

    public final String toString()
        {
        return uc.toString();
        }

    // ---- Per-connection settings, forwarded to uc. ----

    public final void setDoInput( boolean doinput )
        {
        uc.setDoInput( doinput );
        }

    public final boolean getDoInput()
        {
        return uc.getDoInput();
        }

    public final void setDoOutput( boolean dooutput )
        {
        uc.setDoOutput( dooutput );
        }

    public final boolean getDoOutput()
        {
        return uc.getDoOutput();
        }

    public final void setAllowUserInteraction( boolean allowuserinteraction )
        {
        uc.setAllowUserInteraction( allowuserinteraction );
        }

    public final boolean getAllowUserInteraction()
        {
        return uc.getAllowUserInteraction();
        }

    // Static: forwards to URLConnection, not to a wrapped instance.
    public static final void setDefaultAllowUserInteraction(
        boolean defaultallowuserinteraction )
        {
        URLConnection.setDefaultAllowUserInteraction(
            defaultallowuserinteraction );
        }

    // Static: forwards to URLConnection, not to a wrapped instance.
    public static final boolean getDefaultAllowUserInteraction()
        {
        return URLConnection.getDefaultAllowUserInteraction();
        }

    public final void setUseCaches( boolean usecaches )
        {
        uc.setUseCaches( usecaches );
        }

    public final boolean getUseCaches()
        {
        return uc.getUseCaches();
        }

    public final void setIfModifiedSince( long ifmodifiedsince )
        {
        uc.setIfModifiedSince( ifmodifiedsince );
        }

    public final long getIfModifiedSince()
        {
        return uc.getIfModifiedSince();
        }

    public final boolean getDefaultUseCaches()
        {
        return uc.getDefaultUseCaches();
        }

    public final void setDefaultUseCaches( boolean defaultusecaches )
        {
        uc.setDefaultUseCaches( defaultusecaches );
        }

    // ---- Request properties. ----

    public final void setRequestProperty( String key, String value )
        {
        uc.setRequestProperty( key, value );
        }

    public final String getRequestProperty( String key )
        {
        return uc.getRequestProperty( key );
        }

    // Static: forwards to URLConnection, not to a wrapped instance.
    public static final void setDefaultRequestProperty(
        String key, String value )
        {
        URLConnection.setDefaultRequestProperty( key, value );
        }

    // Static: forwards to URLConnection, not to a wrapped instance.
    public static final String getDefaultRequestProperty( String key )
        {
        return URLConnection.getDefaultRequestProperty( key );
        }

    // Static: forwards to URLConnection, not to a wrapped instance.
    public static final void setContentHandlerFactory(
        ContentHandlerFactory fac )
        {
        URLConnection.setContentHandlerFactory( fac );
        }
    }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -