📄 htmlparser.java.svn-base
字号:
// HTMLParser Library v1.1 - A java-based parser for HTML
// Copyright (C) Dec 31, 2000 Somik Raha
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// For any questions or suggestions, you can write to me at :
// Email :somik@kizna.com
//
// Postal Address :
// Somik Raha
// R&D Team
// Kizna Corporation
// Hiroo ON Bldg. 2F, 5-19-9 Hiroo,
// Shibuya-ku, Tokyo,
// 150-0012,
// JAPAN
// Tel : +81-3-54752646
// Fax : +81-3-5449-4870
// Website : www.kizna.com
package jm.util.html;
//////////////////
// Java Imports //
//////////////////
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Enumeration;
import java.util.Vector;
import jm.util.html.scanners.HTMLAppletScanner;
import jm.util.html.scanners.HTMLImageScanner;
import jm.util.html.scanners.HTMLJspScanner;
import jm.util.html.scanners.HTMLLinkScanner;
import jm.util.html.scanners.HTMLMetaTagScanner;
import jm.util.html.scanners.HTMLScriptScanner;
import jm.util.html.scanners.HTMLStyleScanner;
import jm.util.html.scanners.HTMLTagScanner;
import jm.util.html.scanners.HTMLTitleScanner;
import jm.util.html.tags.HTMLTag;
import jm.util.html.util.HTMLLinkProcessor;
/**
* This is the class that the user will use, either to get an iterator into
* the html page or to directly parse the page and print the results
* <BR>
* Typical usage of the parser is as follows : <BR>
* [1] Create a parser object - passing the URL to the parser<BR>
* [2] Register the common scanners. See {@link #registerScanners()} <BR>
* You wouldnt do this if you want to configure a custom lightweight parser. In that case,
* you would add the scanners of your choice using {@link #addScanner(HTMLTagScanner)}<BR>
* [3] Enumerate through the elements from the parser object <BR>
* It is important to note that the parsing occurs when you enumerate, ON DEMAND. This is a thread-safe way,
* and you only get the control back after a particular element is parsed and returned.
*
* <BR>
* Below is some sample code to parse Yahoo.com and print all the tags.
* <pre>
* HTMLParser parser = new HTMLParser("http://www.yahoo.com");
* // In this example, we are registering all the common scanners
* parser.registerScanners();
* for (Enumeration e = parser.elements();e.hasMoreElements();) {
* HTMLNode node = (HTMLNode)e.nextElement();
* node.print();
* }
* </pre>
* Below is some sample code to parse Yahoo.com and print only the text information. This scanning
* will run faster, as there are no scanners registered here.
* <pre>
* HTMLParser parser = new HTMLParser("http://www.yahoo.com");
* // In this example, none of the scanners need to be registered
* // as a string node is not a tag to be scanned for.
* for (Enumeration e = parser.elements();e.hasMoreElements();) {
* HTMLNode node = (HTMLNode)e.nextElement();
* if (node instanceof HTMLStringNode) {
* HTMLStringNode stringNode = (HTMLStringNode)node;
* System.out.println(stringNode.getText());
* }
* }
* </pre>
* The above snippet will print out only the text contents in the html document.<br>
* Here's another snippet that will only print out the link urls in a document.
* This is an example of adding a link scanner.
* <pre>
* HTMLParser parser = new HTMLParser("http://www.yahoo.com");
* parser.addScanner(new HTMLLinkScanner("-l"));
* for (Enumeration e = parser.elements();e.hasMoreElements();) {
* HTMLNode node = (HTMLNode)e.nextElement();
* if (node instanceof HTMLLinkTag) {
* HTMLLinkTag linkTag = (HTMLLinkTag)node;
* System.out.println(linkTag.getLink());
* }
* }
* </pre>
* @see HTMLParser#elements()
*/
public class HTMLParser
{
/**
* The URL or filename to be parsed.
*/
protected String resourceLocn;
/**
* The html reader associated with this parser
*/
protected HTMLReader reader;
/**
* The last read HTML node.
*/
protected HTMLNode node;
/**
* Keeps track of whether the first reading has been performed.
*/
protected boolean readFlag = false;
private Vector scanners = new Vector();
public final static java.lang.String VERSION_STRING = "1.1 (Released Apr 7, 2002)";
/**
* This constructor enables the construction of test cases, with readers
* associated with test string buffers.
* @param reader com.kizna.html.HTMLReader
*/
public HTMLParser(HTMLReader reader)
{
this.reader = reader;
reader.setParser(this);
}
/**
* Creates a HTMLParser object with the location of the resource (URL or file)
* @param resourceLocn Either the URL or the filename (autodetects)
*/
public HTMLParser(String resourceLocn)
{
this.resourceLocn = resourceLocn;
openConnection();
}
/**
* Add a new Tag Scanner.
* In typical situations where you require a no-frills parser, use the registerScanners() method to add the most
* common parsers. But when you wish to either compose a parser with only certain scanners registered, use this method.
* It is advantageous to register only the scanners you want, in order to achieve faster parsing speed. This method
* would also be of use when you have developed custom scanners, and need to register them into the parser.
* @param scanner HTMLTagScanner object (or derivative) to be added to the list of registered scanners
*/
@SuppressWarnings("unchecked")
public void addScanner(HTMLTagScanner scanner) {
scanners.addElement(scanner);
}
private String checkEnding(String link)
{
// Check if the link ends in html, htm, or /. If not, add a slash
// int l1 = link.indexOf("html");
// int l2 = link.indexOf("htm");
// int l3 = link.indexOf("php");
// int l4 = link.indexOf("jsp");
/*if (l1==-1 && l2==-1 && l3==-1 && l4==-1)
{
if (link.charAt(link.length()-1)!='/')
{
link+="/index.html";
}
return link;
} else */return link;
}
/**
* Returns an iterator (enumeration) to the html nodes. Each node can be a tag/endtag/
* string/link/image<br>
* This is perhaps the most important method of this class. In typical situations, you will need to use
* the parser like this :
* <pre>
* HTMLParser parser = new HTMLParser("http://www.yahoo.com");
* parser.registerScanners();
* for (Enumeration e = parser.elements();e.hasMoreElements();) {
* HTMLNode node = (HTMLNode)e.nextElement();
* if (node instanceof HTMLStringNode) {
* // Downcasting to HTMLStringNode
* HTMLStringNode stringNode = (HTMLStringNode)node;
* // Do whatever processing you want with the string node
* System.out.println(stringNode.getText());
* }
* // Check for the node or tag that you want
* if (node instanceof ...) {
* // Downcast, and process
* }
* }
* </pre>
*/
public Enumeration elements()
{
return new Enumeration()
{
public boolean hasMoreElements()
{
if (reader==null) return false;
try
{
node = reader.readElement();
readFlag=true;
if (node==null)
return false;
else
return true;
}
catch (IOException e)
{
System.err.println("I/O Exception occured while reading "+resourceLocn);
return false;
}
}
public Object nextElement()
{
try
{
if (!readFlag) node = reader.readElement();
return node;
}
catch (IOException e)
{
System.err.println("I/O Exception occured while reading "+resourceLocn);
return null;
}
}
};
}
/**
* Flush the current scanners registered. The registered scanners list becomes empty with this call.
*/
public void flushScanners() {
scanners = new Vector();
}
/**
* Get the number of scanners registered currently in the scanner.
* @return int number of scanners registered
*/
public int getNumScanners() {
return scanners.size();
}
/**
* Get an enumeration of scanners registered currently in the parser
* @return Enumeration of scanners currently registered in the parser
*/
public Enumeration getScanners() {
return scanners.elements();
}
/*
* The main program, which can be executed from the command line
*/
public static void main(String [] args)
{
System.out.println("HTMLParser v"+VERSION_STRING);
if (args.length<1 || args[0].equals("-help"))
{
System.out.println();
System.out.println("Syntax : java -jar htmlparser.jar <resourceLocn/website> -l");
System.out.println(" <resourceLocn> the name of the file to be parsed (with complete path if not in current directory)");
System.out.println(" -l Show only the link tags extracted from the document");
System.out.println(" -i Show only the image tags extracted from the document");
System.out.println(" -s Show only the Javascript code extracted from the document");
System.out.println(" -t Show only the Style code extracted from the document");
System.out.println(" -a Show only the Applet tag extracted from the document");
System.out.println(" -j Parse JSP tags");
System.out.println(" -m Parse Meta tags");
System.out.println(" -t Extract the Title");
System.out.println(" -help This screen");
System.out.println();
System.out.println("HTML Parser home page : http://htmlparser.sourceforge.net");
System.out.println();
System.out.println("Example : java -jar htmlparser.jar http://www.yahoo.com");
System.out.println();
System.out.println("If you have any doubts, please join the HTMLParser mailing list (user/developer) from the HTML Parser home page instead of mailing any of the contributors directly. You will be surprised with the quality of open source support. ");
System.exit(-1);
}
if (args[0].indexOf("http")!=-1 || args[0].indexOf("www.")!=-1)
System.out.println("Parsing website "+args[0]);
else
System.out.println("Parsing file "+args[0]+"...");
HTMLParser parser = new HTMLParser(args[0]);
parser.registerScanners();
if (args.length==2)
{
parser.parse(args[1]);
} else
parser.parse(null);
}
/**
* Opens the connection with the resource to begin reading, by creating a HTML reader
* object.
*/
private void openConnection()
{
try
{
if (resourceLocn.indexOf("http")!=-1 || resourceLocn.indexOf("www.")!=-1)
{
// Its a web address
resourceLocn=HTMLLinkProcessor.removeEscapeCharacters(resourceLocn);
resourceLocn=checkEnding(resourceLocn);
URL url = new URL(resourceLocn);
URLConnection uc = url.openConnection();
reader = new HTMLReader(new BufferedReader(new InputStreamReader(uc.getInputStream(),"8859_4")),resourceLocn);
}
else
reader = new HTMLReader(new BufferedReader(new FileReader(resourceLocn)),resourceLocn);
reader.setParser(this);
}
catch (FileNotFoundException e)
{
System.err.println("Error! File "+resourceLocn+" not found!");
}
catch (MalformedURLException e)
{
System.err.println("Error! URL "+resourceLocn+" Malformed!");
}
catch (IOException e)
{
System.err.println("I/O Exception occured while reading "+resourceLocn);
}
}
/**
* Parse the given resource, using the filter provided
*/
public void parse(String filter)
{
HTMLNode node;
for (Enumeration e=elements();e.hasMoreElements();)
{
node = (HTMLNode)e.nextElement();
if (node!=null)
{
if (filter==null)
node.print();
else
{
// There is a filter. Find if the associated filter of this node
// matches the specified filter
if (!(node instanceof HTMLTag)) continue;
HTMLTag tag=(HTMLTag)node;
HTMLTagScanner scanner = tag.getThisScanner();
if (scanner==null) continue;
String tagFilter = scanner.getFilter();
if (tagFilter==null) continue;
if (tagFilter.equals(filter))
node.print();
}
}
else System.out.println("Node is null");
}
}
/**
* This method should be invoked in order to register some common scanners. The scanners that get added are : <br>
* HTMLLinkScanner (filter key "-l")<br>
* HTMLImageScanner (filter key "-i")<br>
* HTMLScriptScanner (filter key "-s") <br>
* HTMLStyleScanner (filter key "-t") <br>
* HTMLJspScanner (filter key "-j") <br>
* HTMLAppletScanner (filter key "-a") <br>
* HTMLMetaTagScanner (filter key "-m") <br>
* HTMLTitleScanner (filter key "-t") <br>
* <br>
* Call this method after creating the HTMLParser object. e.g. <BR>
* <pre>
* HTMLParser parser = new HTMLParser("http://www.yahoo.com");
* parser.registerScanners();
* </pre>
*/
public void registerScanners() {
if (scanners.size()>0) {
System.err.println("registerScanners() should be called first, when no other scanner has been registered.");
System.err.println("Other scanners already exist, hence this method call wont have any effect");
return;
}
addScanner(new HTMLLinkScanner("-l"));
addScanner(new HTMLImageScanner("-i"));
addScanner(new HTMLScriptScanner("-s"));
addScanner(new HTMLStyleScanner("-t"));
addScanner(new HTMLJspScanner("-j"));
addScanner(new HTMLAppletScanner("-a"));
addScanner(new HTMLMetaTagScanner("-m"));
addScanner(new HTMLTitleScanner("-t"));
}
/**
* Removes a specified scanner object.
* @param scanner HTMLTagScanner object to be removed from the list of registered scanners
*/
public boolean removeScanner(HTMLTagScanner scanner) {
return scanners.removeElement(scanner);
}
/**
* This method is to be used to change the set of scanners in the current parser.
* @param newScanners Vector holding scanner objects to be used during the parsing process.
*/
public void setScanners(Vector newScanners) {
scanners = newScanners;
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -