📄 htmlpage.java

📁 利用广度优先遍历搜索一定范围内的所有网页,可用于建立搜索引擎和查找网络错误.
💻 JAVA
字号:
/**
 * Represents an HTML page.
 *
 * @author  SeungJin Lim
 * @version 1.0, 2006/10/20
 * @since   JDK1.5
 */
package html;

import java.net.MalformedURLException;
import java.net.BindException;
import java.net.URL;
import java.net.UnknownHostException;
import java.io.BufferedInputStream;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Vector;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Tidy;

import WebCrawler.Link;


/**
 * An HTML page fetched from a URL, together with the hyperlinks
 * (<code>&lt;a&gt;</code> elements) found in it.
 *
 * <p>The page is downloaded and parsed inside the constructor. Failures while
 * fetching or parsing are reported on the configured {@link PrintStream}
 * rather than propagated to the caller. Until the document has been parsed,
 * {@link #links()} returns <code>null</code> and {@link #linkCount()}
 * returns <code>-1</code>; if link extraction fails part-way, the links
 * collected up to that point are kept.</p>
 */
public class HtmlPage {

	/**
	 * Loads the page at <code>url</code>, reporting problems on
	 * <code>System.out</code>.
	 *
	 * @param url location of the page; if <code>null</code> nothing is loaded
	 * @throws IOException declared for callers; failures raised while loading
	 *         are caught and reported on the output stream instead
	 */
	public HtmlPage( URL url ) throws IOException {
		this( url, System.out );
	}

	/**
	 * Loads the page at <code>url</code> and collects its links.
	 *
	 * @param url location of the page; if <code>null</code> nothing is loaded
	 * @param out stream receiving diagnostics; <code>System.out</code> is used
	 *        when this is <code>null</code>
	 * @throws IOException declared for callers; failures raised while loading
	 *         are caught and reported on <code>out</code> instead
	 */
	public HtmlPage( URL url, PrintStream out ) throws IOException {
		// Fall back to System.out so the error handlers below can always report.
		this.out = ( out != null ) ? out : System.out;
		if( null==url ) return;

		this.url = url;

		if( _DEBUG )
			this.out.println("[HtmlPage] url: "+url.toString());

		load();
	}

	/**
	 * Downloads the page, parses it with Tidy and walks the resulting DOM to
	 * collect links. Every failure is reported on {@link #out}; none escapes.
	 */
	private void load() {
		try {
			BufferedInputStream in = new BufferedInputStream( url.openStream() );
			try {
				Tidy tidy = newTidy();
				Document doc = tidy.parseDOM(in, null);
				// Pretty-printing options, set after parsing as in the original
				// code; presumably they only matter if the disabled pprint call
				// below is re-enabled.
				tidy.setTabsize(4);
				tidy.setSpaces(4);
				tidy.setIndentContent(true);
				//tidy.pprint(doc, System.out);
				links = new Vector<Link>(0);
				process( doc );
			} finally {
				// Release the connection even when parsing or link extraction fails.
				in.close();
			}
		} catch (FileNotFoundException ex) {
			out.println("[HtmlPage] DEAD LINK: "+url);
		} catch (UnknownHostException ex) {
			out.println("[HtmlPage] UNREACHABLE HOST: "+url);
		} catch (BindException ex) {
			out.println("[HtmlPage] BindException: "+ex.getMessage()+" by "+url);
		} catch (Exception ex) {
			// Best effort: a page that cannot be processed is reported, not fatal.
			out.println("[HtmlPage] "+ex.getMessage()+ ", caused by "+url);
			ex.printStackTrace( out );
		}
	}

	/**
	 * Creates the Tidy parser with the options used for every page.
	 *
	 * @return a newly configured parser
	 */
	private static Tidy newTidy() {
		Tidy tidy = new Tidy();
		tidy.setQuiet( true );					// do not print the Tidy banner
		tidy.setXHTML( true );					// output extensible HTML
		tidy.setCharEncoding( org.w3c.tidy.Configuration.UTF8 );
		tidy.setDropFontTags( false );			// keep presentation tags
		tidy.setDropEmptyParas( true );			// discard empty p elements
		tidy.setMakeClean( true );				// remove presentational clutter
		tidy.setEncloseBlockText( true );		// wrap text in blocks in <p>'s
		tidy.setShowWarnings( false );			// hide warnings
		return tidy;
	}

	/**
	 * Recursively visits <code>node</code> and its descendants, adding a
	 * {@link Link} to {@link #links} for every <code>A</code> element found.
	 *
	 * @param node the node to visit; <code>null</code> is ignored
	 * @throws MalformedURLException propagated from link construction
	 * @throws UnsupportedEncodingException propagated from link construction
	 */
	private void process( Node node ) throws MalformedURLException, UnsupportedEncodingException {
		// A document without a root element yields a null node; nothing to visit.
		if( node == null ) return;

		switch ( node.getNodeType() ) {
		case Node.DOCUMENT_NODE:
			if(_DEBUG)
				out.println("[HtmlPage.process] DOCUMENT_NODE: "+node.getNodeName()+"...");
			process(((Document)node).getDocumentElement());
			break;

		case Node.DOCUMENT_TYPE_NODE:
			if(_DEBUG)
				out.println("[HtmlPage.process] DOCUMENT_TYPE_NODE: "+node.getNodeName()+"...");
			break;

		case Node.ELEMENT_NODE:
			if(_DEBUG)
				out.println("[HtmlPage.process] ELEMENT_NODE: "+node.getNodeName()+"...");

			if( node.getNodeName().equalsIgnoreCase("A") ) {
				String tag = anchorTag( node );
				if(_DEBUG)
					out.println("[HtmlPage.process] \t"+tag+"...");
				// AnchorElement extracts the href from the textual start tag.
				AnchorElement anchor = new AnchorElement( tag );
				links.addElement( new Link( url(), anchor.getHref(), true, out ) );
			}

			// Anchors may be nested anywhere, so always descend into children.
			NodeList children = node.getChildNodes();
			if( children != null ) {
				int len = children.getLength();
				for( int i = 0; i < len; i++ ) {
					process(children.item(i));
				}
			}
			break;

		case Node.TEXT_NODE:
			break;

		case Node.COMMENT_NODE:
			if(_DEBUG)
				out.println("[HtmlPage.process] COMMENT_NODE: "+node.getNodeName()+"...");
			break;

		default:
			if(_DEBUG)
				out.println("[HtmlPage.process] What else: "+node.getNodeName()+","+node.getNodeValue()+"???");
			break;
		} // switch
	} // process()

	/**
	 * Rebuilds the start tag of an anchor element as text, in the form
	 * <code>&lt;A name="value" ...&gt;</code>, with the attributes in the
	 * order the DOM reports them.
	 *
	 * @param anchor an element node whose name is "a" in either case
	 * @return the textual start tag
	 */
	private static String anchorTag( Node anchor ) {
		// The caller has matched the name against "A", so its upper-case form is fixed.
		StringBuilder sb = new StringBuilder("<A");
		NamedNodeMap attrs = anchor.getAttributes();
		for( int i = 0; i < attrs.getLength(); i++ ) {
			Node attr = attrs.item(i);
			sb.append(' ');
			sb.append(attr.getNodeName());
			sb.append("=\"");
			sb.append(attr.getNodeValue());
			sb.append('"');
		}
		sb.append('>');
		return sb.toString();
	}

	/**
	 * Returns the link at the given position.
	 *
	 * @param index zero-based position in {@link #links()}
	 * @return the link stored at <code>index</code>
	 */
	public Link linkAt( int index ) {
		return links.elementAt(index);
	}

	/**
	 * Returns the links collected from this page.
	 *
	 * @return the live link vector, or <code>null</code> if the page has not
	 *         been parsed
	 */
	public Vector<Link> links() {
		return links;
	}

	/**
	 * Returns the number of links collected from this page.
	 *
	 * @return the link count, or <code>-1</code> if the page has not been parsed
	 */
	public int linkCount() {
		return ( links != null ) ? links.size() : -1;
	}

	/**
	 * Returns the source URL of this page.
	 *
	 * @return the URL given at construction, or <code>null</code> if none was given
	 */
	public URL url() {
		return url;
	}

	/**
	 * Loads a page from the local web server and prints its links.
	 *
	 * @param arg ignored
	 */
	public static void main( String[] arg ) {
		try {
			HtmlPage page = new HtmlPage( new URL("http://localhost/index.html.en") );
			System.out.println( "Copied from "+page.url() );
			System.out.println( "Number of links: "+page.linkCount() );
			for( int i=0; i<page.linkCount(); i++ ) {
				System.out.println("\t"+page.linkAt(i));
			}
		} catch (IOException e) {
			// Also covers MalformedURLException, which is an IOException.
			e.printStackTrace();
		}
	}

	/**
	 * The source URL of this page.
	 */
	private URL url;
	/**
	 * Links contained in this page; <code>null</code> until the page has been parsed.
	 */
	Vector<Link> links;
	/**
	 * Stream that receives diagnostic and error messages.
	 */
	PrintStream out;

	/**
	 * When <code>true</code>, traversal details are printed to {@link #out}.
	 */
	boolean _DEBUG = false;
}
💿 文件大小 574 K
👤 上传用户 mmmmmmmmmxxx
📂 所属分类 Java编程
🏷️ 相关标签

#搜索 #页 #搜索引擎 #网络
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -