📄 link.java
字号:
/**
* Represents a Web link.
*
* @author SeungJin Lim
* @version 1.0, 2006/10/20
* @since JDK1.5
*/
package WebCrawler;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import io.MyPrintStream;
/**
 * A hyperlink resolved against an optional context (base) URL.
 *
 * <p>Only the {@code http} protocol is accepted. When the link cannot be
 * resolved (null or malformed spec, unsupported protocol) the instance is
 * still created, but {@link #getURL()} returns {@code null}; callers are
 * expected to check for that.
 */
public class Link {

    /** Compile-time switch: when true, every resolved URL is traced part by part. */
    private static final boolean DEBUG = false;

    /** The only protocol accepted by this class. */
    private static final String HTTP = "http";

    /** File names probed, in this order, when guessing a directory's default document. */
    private static final String[] INDEX_FILE_NAMES = { "index.html", "index.htm" };

    /** Base URL the spec was resolved against; stays null when the spec was null. */
    private URL context = null;

    /** URL representing this Link, or null when the link could not be resolved. */
    private URL url = null;

    /** Destination of diagnostic messages. */
    private PrintStream out = null;

    /**
     * Resolves {@code spec} against {@code context} without file-name guessing,
     * reporting problems to {@code MyPrintStream.out}.
     *
     * @param context base URL, may be null when {@code spec} is absolute
     * @param spec    link text (absolute or relative URL), may be null
     */
    public Link( URL context, String spec ) {
        this( context, spec, false, MyPrintStream.out );
    }

    /**
     * Resolves {@code spec} against {@code context}, reporting problems to
     * {@code MyPrintStream.out}.
     *
     * @param context       base URL, may be null when {@code spec} is absolute
     * @param spec          link text (absolute or relative URL), may be null
     * @param guessFileName if true and the resolved URL ends with "/", probe the
     *                      server for a default document (network access)
     */
    public Link( URL context, String spec, boolean guessFileName ) {
        this( context, spec, guessFileName, MyPrintStream.out );
    }

    /**
     * Resolves {@code spec} against {@code context} without file-name guessing.
     *
     * @deprecated use {@link #Link(URL, String, boolean, PrintStream)} instead
     * @param context base URL, may be null when {@code spec} is absolute
     * @param spec    link text (absolute or relative URL), may be null
     * @param out     stream for diagnostic messages; null means {@code System.out}
     */
    @Deprecated
    public Link( URL context, String spec, PrintStream out ) {
        this( context, spec, false, out );
    }

    /**
     * Resolves {@code spec} against {@code context}.
     *
     * <p>No exception is thrown on failure: a message is printed to the
     * diagnostic stream and {@link #getURL()} returns null afterwards.
     * A null {@code spec} leaves both the URL and the context unset.
     *
     * @param context       base URL, may be null when {@code spec} is absolute
     * @param spec          link text (absolute or relative URL), may be null
     * @param guessFileName if true and the resolved URL ends with "/", probe the
     *                      server for "index.html", then "index.htm" (network access)
     * @param printstream   stream for diagnostic messages; null means {@code System.out}
     */
    public Link( URL context, String spec, boolean guessFileName, PrintStream printstream ) {
        this.out = ( printstream != null ) ? printstream : System.out;
        if( spec == null ) {
            return;
        }
        this.context = context;
        this.url = resolve( context, spec, guessFileName );
    }

    /**
     * Builds the final URL for this link, or returns null (after printing a
     * diagnostic) when the spec is malformed or the protocol is not http.
     */
    private URL resolve( URL base, String spec, boolean guessFileName ) {
        try {
            URL resolved = new URL( base, spec );
            if( DEBUG ) {
                out.println("[Link] url: "+resolved);
                out.println("[Link] protocol: "+resolved.getProtocol());
                out.println("[Link] host: "+resolved.getHost());
                out.println("[Link] port: "+resolved.getPort());
                out.println("[Link] default port: "+resolved.getDefaultPort());
                out.println("[Link] file: "+resolved.getFile());
                out.println("[Link] query: "+resolved.getQuery());
            }
            if( !HTTP.equals( resolved.getProtocol() ) ) {
                out.println("[Link] Unsupported protocol: "+resolved);
                return null;
            }
            // Protocol and host only, without the document root: point at "/".
            // Resolving "/" against the URL (instead of appending to its string
            // form) keeps a trailing "#fragment" from swallowing the slash.
            String file = resolved.getFile();
            if( file == null || file.length() == 0 ) {
                resolved = new URL( resolved, "/" );
            }
            // The URL names a directory; optionally probe for its default document.
            if( guessFileName && resolved.toString().endsWith("/") ) {
                URL guess = guessIndexFile( resolved );
                if( guess != null ) {
                    resolved = guess;
                }
            }
            return resolved;
        } catch (MalformedURLException ex) {
            out.println("[Link] MalformedURLException: "+spec);
            return null;
        }
    }

    /**
     * Probes the server for a default document under {@code directory}.
     *
     * @return the first candidate that could be opened, or null (after printing
     *         a diagnostic) when none could
     * @throws MalformedURLException if a candidate URL cannot be formed
     */
    private URL guessIndexFile( URL directory ) throws MalformedURLException {
        for( int i = 0; i < INDEX_FILE_NAMES.length; i++ ) {
            URL candidate = new URL( directory.toString() + INDEX_FILE_NAMES[i] );
            if( canOpen( candidate ) ) {
                return candidate;
            }
        }
        out.println("[Link] cannot guess the file name for "+directory);
        return null;
    }

    /**
     * Returns true when a stream to {@code candidate} can be opened. The stream
     * is always closed again so that probing does not leak connections.
     */
    private static boolean canOpen( URL candidate ) {
        InputStream in = null;
        try {
            in = candidate.openStream();
            return true;
        } catch (IOException ex) {
            return false;
        } finally {
            if( in != null ) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // The probe already succeeded; a failure to close changes nothing.
                }
            }
        }
    }

    /**
     * @return the base URL the link was resolved against; null when none was
     *         given or when the spec was null
     */
    public URL getContext() {
        return this.context;
    }

    /**
     * Returns the last two dot-separated labels of the host name, e.g.
     * "usu.edu" for "www.cs.usu.edu". A host without a dot is returned as is.
     *
     * <p>This is a purely textual heuristic: it does not know about multi-label
     * public suffixes (such as "co.uk") or numeric IP addresses.
     *
     * @return the domain, or null when this link has no URL
     */
    public String getDomain() {
        URL current = getURL();
        if( current == null ) {
            return null;
        }
        String host = current.getHost();
        int lastDot = host.lastIndexOf('.');
        if( lastDot < 0 ) {
            return host;
        }
        // Start right after the dot preceding the last one; when there is no
        // such dot, lastIndexOf yields -1 and the whole host is returned.
        int previousDot = host.lastIndexOf( '.', lastDot - 1 );
        return host.substring( previousDot + 1 );
    }

    /**
     * @return the resolved URL, or null when the link could not be resolved
     */
    public URL getURL() {
        return url;
    }

    /**
     * @return true when this link has a URL whose protocol is "http"
     */
    public boolean isHTTP() {
        URL current = getURL();
        return current != null && HTTP.equals( current.getProtocol() );
    }

    /**
     * Guesses from the path whether this link points at an HTML-like document:
     * the path must contain ".htm", ".shtml" or ".cfm". Only the path is
     * inspected, so a host name or query string containing one of these
     * markers does not count.
     *
     * @return true when the path looks like an HTML document; false otherwise
     *         or when this link has no URL
     */
    public boolean isHTML() {
        URL current = getURL();
        if( current == null ) {
            return false;
        }
        String path = current.getPath();
        return path.indexOf(".htm") >= 0
            || path.indexOf(".shtml") >= 0
            || path.indexOf(".cfm") >= 0;
    }

    /**
     * Replaces the stream used for diagnostic messages.
     *
     * @param out the new stream; null means {@code System.out}, as in the constructor
     */
    public void setPrintStream( PrintStream out ) {
        this.out = ( out != null ) ? out : System.out;
    }

    /**
     * Returns the string form of the URL.
     *
     * <p>NOTE: returns null (not an empty string) when this link has no URL.
     * That is kept for backward compatibility with callers that test the
     * result against null; new code should test {@link #getURL()} instead.
     */
    @Override
    public String toString() {
        URL current = getURL();
        return ( current != null ) ? current.toString() : null;
    }

    /**
     * Small manual demo: resolves a fixed set of specs against fixed bases and
     * prints each resulting link together with its domain.
     */
    public static void main( String[] args )
            throws MalformedURLException, UnsupportedEncodingException {
        String[] anchors = {
            "http://www.cs.usu.edu",
            "doc/index.html",
            "../doc/index.html",
            "http://www.cs.usu.edu/public_html/index.html",
            "mailto:lim@cc.usu.edu",
            "http://localhost/pub/index.html",
            "http://students.net/"
        };
        String[] bases = {
            "http://www.cs.usu.edu/public_html/index.html",
            "http://www.cs.usu.edu/public_html/index.html",
            "http://www.cs.usu.edu/public_html/index.html",
            null,
            null,
            null,
            null
        };
        for( int i = 0; i < anchors.length; i++ ) {
            System.out.println( bases[i]+", "+anchors[i]);
            URL base = ( bases[i] != null ) ? new URL( bases[i] ) : null;
            Link link = new Link( base, anchors[i] );
            // Unresolvable links (e.g. mailto:) have no URL and are skipped.
            if( link.getURL() != null ) {
                System.out.println("\tLink: "+link + ", Domain: " + link.getDomain());
            }
        } // for i
    } // main
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -