📄 link.java

📁 利用广度优先遍历搜索一定范围内的所有网页,可用于建立搜索引擎和查找网络错误.
💻 JAVA
字号:
/**
 * Represents a Web link.
 *
 * @author  SeungJin Lim
 * @version 1.0, 2006/10/20
 * @since   JDK1.5
 */
package WebCrawler;


import java.io.IOException;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import io.MyPrintStream;

/**
 * A hyperlink resolved against an optional context (base) URL.
 *
 * <p>Only the <code>http</code> protocol is accepted. When the link cannot be
 * resolved (null or malformed spec, unsupported protocol), the instance is
 * still created but {@link #getURL()} returns <code>null</code>; a diagnostic
 * line is written to the configured print stream for malformed specs and
 * unsupported protocols.
 */
public class Link {

	/** Compile-time switch for verbose tracing of every resolved URL. */
	private static final boolean DEBUG = false;

	/** File names probed, in order, when guessing the document of a directory URL. */
	private static final String[] INDEX_FILE_NAMES = { "index.html", "index.htm" };

	/** URL the spec was resolved against; may be null for absolute specs. */
	private final URL context;

	/** URL representing this Link, or null when the spec could not be resolved. */
	private final URL url;

	/** Destination of diagnostic messages; never null. */
	private PrintStream out;

	/**
	 * Creates a link without file-name guessing, reporting to <code>MyPrintStream.out</code>.
	 *
	 * @param context base URL used to resolve <code>spec</code>; may be null
	 * @param spec    absolute or relative link text; may be null
	 */
	public Link( URL context, String spec ) {
		this( context, spec, false, MyPrintStream.out );
	}

	/**
	 * Creates a link, reporting to <code>MyPrintStream.out</code>.
	 *
	 * @param context       base URL used to resolve <code>spec</code>; may be null
	 * @param spec          absolute or relative link text; may be null
	 * @param guessFileName whether to probe for an index file when the URL ends with "/"
	 */
	public Link( URL context, String spec, boolean guessFileName ) {
		this( context, spec, guessFileName, MyPrintStream.out );
	}

	/**
	 * Creates a link without file-name guessing, reporting to the given stream.
	 *
	 * @deprecated use {@link #Link(URL, String, boolean, PrintStream)} instead.
	 * @param context base URL used to resolve <code>spec</code>; may be null
	 * @param spec    absolute or relative link text; may be null
	 * @param out     stream for diagnostics; null falls back to <code>System.out</code>
	 */
	@Deprecated
	public Link( URL context, String spec, PrintStream out ) {
		this( context, spec, false, out );
	}

	/**
	 * Creates a link by resolving <code>spec</code> against <code>context</code>.
	 *
	 * <p>With <code>guessFileName</code> set, a URL ending in "/" triggers network
	 * access: the candidate index files are opened to find one that exists.
	 *
	 * @param context       base URL used to resolve <code>spec</code>; may be null
	 * @param spec          absolute or relative link text; null yields an unresolved link
	 * @param guessFileName whether to probe for an index file when the URL ends with "/"
	 * @param printstream   stream for diagnostics; null falls back to <code>System.out</code>
	 */
	public Link( URL context, String spec, boolean guessFileName, PrintStream printstream ) {
		// Initialise the stream and context first so that even an unresolved
		// link (null spec) is left in a fully usable state.
		this.out = ( null==printstream ) ? System.out : printstream;
		this.context = context;
		this.url = ( null==spec ) ? null : resolve( context, spec, guessFileName );
	}

	/**
	 * Resolves <code>spec</code> against <code>base</code> and normalises the result.
	 *
	 * @return the resolved http URL, or null when the spec is malformed or
	 *         uses a protocol other than http
	 */
	private URL resolve( URL base, String spec, boolean guessFileName ) {
		try {
			URL resolved = new URL( base, spec );

			if( DEBUG ) {
				out.println("[Link] url: "+resolved);
				out.println("[Link] protocol: "+resolved.getProtocol());
				out.println("[Link] host: "+resolved.getHost());
				out.println("[Link] port: "+resolved.getPort());
				out.println("[Link] default port: "+resolved.getDefaultPort());
				out.println("[Link] file: "+resolved.getFile());
				out.println("[Link] query: "+resolved.getQuery());
			}

			if( !"http".equals( resolved.getProtocol() ) ) {
				out.println("[Link] Unsupported protocol: "+resolved);
				return null;
			}

			// The URL has a protocol and host but no document root: add "/".
			// Resolving "/" against the URL (rather than appending to its string
			// form) keeps a trailing "#fragment" from swallowing the slash.
			String file = resolved.getFile();
			if( null==file || file.length()==0 ) {
				resolved = new URL( resolved, "/" );
			}

			// The URL names a directory; optionally guess the document name.
			if( guessFileName && resolved.toString().endsWith("/") ) {
				URL guess = guessIndexFile( resolved );
				if( guess!=null ) {
					resolved = guess;
				}
			}
			return resolved;
		} catch (MalformedURLException ex) {
			out.println("[Link] MalformedURLException: "+spec);
			return null;
		}
	}

	/**
	 * Probes the known index file names below <code>directory</code>.
	 *
	 * @return the first candidate that can be opened, or null if none can
	 * @throws MalformedURLException if a candidate URL cannot be formed
	 */
	private URL guessIndexFile( URL directory ) throws MalformedURLException {
		for( String name : INDEX_FILE_NAMES ) {
			URL candidate = new URL( directory.toString()+name );
			if( canOpen( candidate ) ) {
				return candidate;
			}
		}
		out.println("[Link] cannot guess the file name for "+directory);
		return null;
	}

	/**
	 * Tells whether a stream to <code>candidate</code> can be opened.
	 * The stream is closed immediately so no connection is leaked.
	 */
	private static boolean canOpen( URL candidate ) {
		try {
			candidate.openStream().close();
			return true;
		} catch (IOException ex) {
			// Any I/O failure is treated as "this file does not exist".
			return false;
		}
	}

	/**
	 * @return the context URL passed to the constructor; may be null
	 */
	public URL getContext() {
		return this.context;
	}

	/**
	 * Returns the last two dot-separated labels of the host, e.g.
	 * <code>usu.edu</code> for <code>www.cs.usu.edu</code>. A host with fewer
	 * than two labels is returned unchanged.
	 *
	 * @return the domain, or null when this link has no resolved URL
	 */
	public String getDomain() {
		URL current = getURL();
		if( null==current ) {
			return null;
		}
		String host = current.getHost();

		int lastDot = host.lastIndexOf('.');
		if( lastDot<0 ) {
			return host;
		}
		// Look for the dot preceding the second-to-last label; when absent
		// (-1) the substring starts at 0 and the whole host is returned.
		int previousDot = host.lastIndexOf( '.', lastDot-1 );
		return host.substring( previousDot+1 );
	}

	/**
	 * @return the resolved URL, or null when the link could not be resolved
	 */
	public URL getURL() {
		return url;
	}

	/**
	 * @return true when this link has a resolved URL using the http protocol
	 */
	public boolean isHTTP() {
		URL current = getURL();
		return current!=null && "http".equals( current.getProtocol() );
	}

	/**
	 * Heuristically tells whether the link points to an HTML document: the
	 * URL string is searched for ".htm", ".shtml" or ".cfm" at any position.
	 *
	 * @return true when one of the markers occurs in the URL; false when the
	 *         link has no resolved URL
	 */
	public boolean isHTML() {
		URL current = getURL();
		if( null==current ) {
			return false;
		}
		String text = current.toString();
		return text.indexOf(".htm")>=0 || text.indexOf(".shtml")>=0 || text.indexOf(".cfm")>=0;
	}

	/**
	 * Replaces the stream used for diagnostics.
	 *
	 * @param out the new stream; null falls back to <code>System.out</code>,
	 *            matching the constructor
	 */
	public void setPrintStream( PrintStream out ) {
		this.out = ( null==out ) ? System.out : out;
	}

	/**
	 * Returns the string form of the resolved URL.
	 *
	 * <p>NOTE(review): returns null for an unresolved link. This is kept for
	 * backward compatibility because callers test the result against null.
	 */
	@Override
	public String toString() {
		URL current = getURL();
		return ( current!=null ) ? current.toString() : null;
	}

	/**
	 * Small manual demo: resolves a fixed list of specs against their bases
	 * and prints the resulting link and domain for each one that resolves.
	 */
	public static void main( String[] args )
	throws MalformedURLException, UnsupportedEncodingException {
		String[] anchors = {
			"http://www.cs.usu.edu",
			"doc/index.html",
			"../doc/index.html",
			"http://www.cs.usu.edu/public_html/index.html",
			"mailto:lim@cc.usu.edu",
			"http://localhost/pub/index.html",
			"http://students.net/"
		};
		String[] bases = {
			"http://www.cs.usu.edu/public_html/index.html",
			"http://www.cs.usu.edu/public_html/index.html",
			"http://www.cs.usu.edu/public_html/index.html",
			null,
			null,
			null,
			null
		};

		for( int i=0; i<anchors.length; i++ ) {
			System.out.println( bases[i]+", "+anchors[i]);
			URL base = ( bases[i]!=null ) ? new URL( bases[i] ) : null;
			Link link = new Link( base, anchors[i] );
			// Unresolved links (e.g. mailto:) have no URL and are skipped.
			if( null!=link.getURL() ) {
				System.out.println("\tLink: "+link + ", Domain: " + link.getDomain());
			}
		} // for i
	} // main

}
💿 文件大小 574 K
👤 上传用户 mmmmmmmmmxxx
📂 所属分类 Java编程
🏷️ 相关标签

#搜索 #页 #搜索引擎 #网络
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -