// File: htmlpage.java
// (Code-viewer UI label, not part of the source: "Font size:")
/**
* Represents an HTML page.
*
* @author SeungJin Lim
* @version 1.0, 2006/10/20
* @since JDK1.5
*/
package html;
import java.net.MalformedURLException;
import java.net.BindException;
import java.net.URL;
import java.net.UnknownHostException;
import java.io.BufferedInputStream;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Vector;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Tidy;
import WebCrawler.Link;
/**
 * Represents an HTML page fetched from a URL, together with the anchor
 * links found while walking its DOM.
 *
 * <p>The page is downloaded and parsed in the constructor. Fetch and parse
 * failures are reported to the supplied {@link PrintStream} rather than
 * thrown, in which case {@link #links()} stays {@code null} and
 * {@link #linkCount()} returns {@code -1}.
 */
public class HtmlPage {

    /**
     * Fetches and parses the page at {@code url}, writing diagnostics to
     * {@code System.out}.
     *
     * @param url location of the page; see {@link #HtmlPage(URL, PrintStream)}
     *            for the handling of {@code null}
     * @throws IOException declared for caller compatibility
     */
    public HtmlPage( URL url ) throws IOException {
        this( url, System.out );
    }

    /**
     * Fetches the page at {@code url}, tidies it into a DOM with JTidy and
     * collects every anchor element as a {@link Link}.
     *
     * <p>If {@code url} is {@code null} the constructor returns immediately,
     * leaving every field at its default value. All exceptions raised while
     * fetching or parsing are caught and reported on {@code out}.
     *
     * @param url location of the page to load
     * @param out stream receiving diagnostic and error messages
     * @throws IOException declared for caller compatibility; failures inside
     *                     the fetch/parse section are caught and reported
     */
    public HtmlPage( URL url, PrintStream out ) throws IOException {
        if( null==url ) return;
        this.url = url;
        this.out = out;
        if( _DEBUG )
            out.println("[HtmlPage] url: "+url.toString());
        try {
            // Use the field directly: url() is overridable and must not be
            // relied on while the object is still being constructed.
            BufferedInputStream in = new BufferedInputStream( url.openStream() );
            try {
                Tidy tidy = new Tidy();
                tidy.setQuiet( true );              // suppress the JTidy banner line
                tidy.setXHTML( true );              // output extensible HTML
                tidy.setCharEncoding( org.w3c.tidy.Configuration.UTF8 );
                tidy.setDropFontTags( false );      // keep <font> tags
                tidy.setDropEmptyParas( true );     // discard empty p elements
                tidy.setMakeClean( true );          // remove presentational clutter
                tidy.setEncloseBlockText( true );   // wrap text in blocks in <p>'s
                tidy.setShowWarnings( false );      // hide warnings; errors are still shown
                Document doc = tidy.parseDOM(in, null);
                // NOTE(review): the three settings below look like pretty-print
                // options and nothing is printed here; kept in their original
                // position in case they are needed when pprint is re-enabled.
                tidy.setTabsize(4);
                tidy.setSpaces(4);
                tidy.setIndentContent(true);
                links = new Vector<Link>(0);
                process( doc );
            } finally {
                // Always release the connection, even when parsing or link
                // extraction fails part-way through.
                in.close();
            }
        } catch (FileNotFoundException ex) {
            out.println("[HtmlPage] DEAD LINK: "+url);
        } catch (UnknownHostException ex) {
            out.println("[HtmlPage] UNREACHABLE HOST: "+url);
        } catch (BindException ex) {
            out.println("[HtmlPage] BindException: "+ex.getMessage()+" by "+url);
        } catch (Exception ex) {
            out.println("[HtmlPage] "+ex.getMessage()+ ", caused by "+url);
            ex.printStackTrace( out );
        }
    }

    /**
     * Recursively walks the DOM below {@code node} and records a {@link Link}
     * for every anchor element encountered.
     *
     * @param node the node to visit; {@code null} is ignored
     * @throws MalformedURLException propagated from {@link Link} construction
     * @throws UnsupportedEncodingException propagated from {@link Link} construction
     */
    private void process( Node node ) throws MalformedURLException, UnsupportedEncodingException {
        if( node == null ) return; // e.g. a document without a root element
        switch ( node.getNodeType() ) {
        case Node.DOCUMENT_NODE:
            if( _DEBUG )
                out.println("[HtmlPage.process] DOCUMENT_NODE: "+node.getNodeName()+"...");
            process(((Document)node).getDocumentElement());
            break;
        case Node.DOCUMENT_TYPE_NODE:
            if( _DEBUG )
                out.println("[HtmlPage.process] DOCUMENT_TYPE_NODE: "+node.getNodeName()+"...");
            break;
        case Node.ELEMENT_NODE:
            if( _DEBUG )
                out.println("[HtmlPage.process] ELEMENT_NODE: "+node.getNodeName()+"...");
            if( node.getNodeName().equalsIgnoreCase("A") ) {
                String tag = anchorTag( node );
                if( _DEBUG )
                    out.println("[HtmlPage.process] \t"+tag+"...");
                AnchorElement anchor = new AnchorElement( tag );
                links.addElement( new Link(url, anchor.getHref(), true, out ) );
            }
            NodeList children = node.getChildNodes();
            if( children != null ) {
                int len = children.getLength();
                for( int i = 0; i < len; i++ ) {
                    process(children.item(i));
                }
            }
            break;
        case Node.TEXT_NODE:
            break;
        case Node.COMMENT_NODE:
            if( _DEBUG )
                out.println("[HtmlPage.process] COMMENT_NODE: "+node.getNodeName()+"...");
            break;
        default:
            if( _DEBUG )
                out.println("[HtmlPage.process] What else: "+node.getNodeName()+","+node.getNodeValue()+"???");
            break;
        }
    }

    /**
     * Rebuilds the opening tag of an anchor element as text, in the form
     * {@code <A name="value" ...>}, for handing to {@link AnchorElement}.
     *
     * @param node an anchor element node
     * @return the reconstructed opening tag
     */
    private String anchorTag( Node node ) {
        StringBuilder sb = new StringBuilder();
        sb.append('<');
        sb.append(node.getNodeName().toUpperCase());
        NamedNodeMap attrs = node.getAttributes();
        for( int i = 0; i < attrs.getLength(); i++ ) {
            Node attr = attrs.item(i);
            sb.append(' ');
            sb.append(attr.getNodeName());
            sb.append("=\"");
            // NOTE(review): values are not escaped, so an attribute value
            // containing a double quote yields a malformed tag; confirm how
            // AnchorElement parses its input before changing this.
            sb.append(attr.getNodeValue());
            sb.append('"');
        }
        sb.append('>');
        return sb.toString();
    }

    /**
     * Returns the link at the given position.
     *
     * @param index zero-based position in the link list
     * @return the link stored at {@code index}
     */
    public Link linkAt( int index ) {
        return links.elementAt(index);
    }

    /**
     * Returns the live, internal list of links found in this page.
     *
     * @return the link list, or {@code null} if the page was never parsed
     */
    public Vector<Link> links() {
        return links;
    }

    /**
     * Returns the number of links found in this page.
     *
     * @return the link count, or {@code -1} if the page was never parsed
     */
    public int linkCount() {
        if( links!=null ) return links.size();
        else return -1;
    }

    /**
     * Returns the source URL of this page.
     *
     * @return the URL given at construction, or {@code null} if none was given
     */
    public URL url() {
        return url;
    }

    /**
     * Manual smoke test: loads a page from localhost and prints its links.
     *
     * @param arg ignored
     */
    public static void main( String[] arg ) {
        try {
            HtmlPage page = new HtmlPage( new URL("http://localhost/index.html.en") );
            System.out.println( "Copied from "+page.url() );
            System.out.println( "Number of links: "+page.linkCount() );
            for( int i=0; i<page.linkCount(); i++ ) {
                System.out.println("\t"+page.linkAt(i));
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * The source URL of this page.
     */
    private URL url;

    /**
     * Links contained in this page; {@code null} until the page has been
     * fetched and parsed successfully.
     */
    Vector<Link> links;

    /**
     * Stream receiving diagnostic and error messages.
     */
    PrintStream out;

    /**
     * When {@code true}, progress messages are written to {@link #out}.
     */
    boolean _DEBUG = false;
}
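/*
 * Code-viewer residue (not part of the source) - keyboard shortcuts:
 *   Copy code:          Ctrl + C
 *   Search code:        Ctrl + F
 *   Full-screen mode:   F11
 *   Toggle theme:       Ctrl + Shift + D
 *   Show shortcuts:     ?
 *   Increase font size: Ctrl + =
 *   Decrease font size: Ctrl + -
 */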