📄 htmltagscanner.java.svn-base

📁 梦界家园程序开发基底框架
💻 SVN-BASE
字号:
// HTMLParser Library v1.1 - A java-based parser for HTML
// Copyright (C) Dec 31, 2000 Somik Raha
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
// For any questions or suggestions, you can write to me at :
// Email :somik@kizna.com
// 
// Postal Address : 
// Somik Raha
// R&D Team
// Kizna Corporation
// Hiroo ON Bldg. 2F, 5-19-9 Hiroo,
// Shibuya-ku, Tokyo, 
// 150-0012, 
// JAPAN
// Tel  :  +81-3-54752646
// Fax : +81-3-5449-4870
// Website : www.kizna.com

package jm.util.html.scanners;
//////////////////
// Java Imports //
//////////////////
import java.io.IOException;

import jm.util.html.HTMLNode;
import jm.util.html.HTMLReader;
import jm.util.html.HTMLStringNode;
import jm.util.html.tags.HTMLEndTag;
import jm.util.html.tags.HTMLTag;
/**
 * Abstract base class for tag scanners. A concrete scanner decides, via
 * {@link #evaluate(String, HTMLTagScanner)}, whether it can handle a tag and, via
 * {@link #scan(HTMLTag, String, HTMLReader, String)}, turns that tag into a node.
 * The class also offers small string helpers for picking attribute values and
 * simple XML-style element text out of tags.
 */
public abstract class HTMLTagScanner
{
	/**
	 * A filter which is used to associate this tag. The filter contains a string
	 * that is used to match which tags are to be allowed to pass through. This can
	 * be useful when one wishes to dynamically filter out all tags except one type
	 * which may be programmed later than the parser. It is also useful for command
	 * line implementations of the parser.
	 */
	protected String filter;

	/**
	 * Creates a scanner with an empty filter string.
	 * Note: this constructor only initialises the filter; it performs no registration.
	 */
	public HTMLTagScanner()
	{
		this.filter = "";
	}

	/**
	 * Creates a scanner with the given filter.
	 * Note: this constructor only stores the filter; it performs no registration.
	 * @param filter The filter which will allow this tag to pass through.
	 */
	public HTMLTagScanner(String filter)
	{
		this.filter = filter;
	}

	/**
	 * Drops everything up to and including the first occurrence of a character.
	 * @param s the string to shorten; must not be null
	 * @param c the character to look for
	 * @return the part of {@code s} after the first {@code c}, or {@code s}
	 *         unchanged when {@code c} does not occur
	 */
	public String absorb(String s, char c)
	{
		int index = s.indexOf(c);
		if (index != -1) {
			s = s.substring(index + 1);
		}
		return s;
	}

	/**
	 * Removes leading space characters (' ' only, not tabs or newlines).
	 * Empty and all-blank inputs yield the empty string instead of failing.
	 * @param s the string to strip; must not be null
	 * @return {@code s} without its leading spaces
	 */
	public static String absorbLeadingBlanks(String s)
	{
		int length = s.length();
		int start = 0;
		// Bounds check first: the string may be empty or consist only of blanks.
		while (start < length && s.charAt(start) == ' ') {
			start++;
		}
		return s.substring(start);
	}

	/**
	 * Template Method, used to decide if this scanner can handle this tag type. If the
	 * evaluation returns true, the calling side makes a call to scan().
	 * @param s The complete text contents of the HTMLTag.
	 * @param previousOpenScanner Indicates any previous scanner which has not completed
	 * before the current scan has begun, and hence allows us to write scanners that can
	 * work with dirty html
	 * @return true if this scanner can handle the tag
	 */
	public abstract boolean evaluate(String s, HTMLTagScanner previousOpenScanner);

	/**
	 * Extracts the value of an attribute from a tag's text.
	 * <p>
	 * The attribute name is located case-insensitively as a plain substring of the
	 * tag text, so it is not restricted to attribute-name positions. Everything up
	 * to the next '=' is skipped, then whitespace. A value that starts with a double
	 * quote runs to the closing double quote (or, if the quote is never closed, to the
	 * first space); an unquoted value runs to the first whitespace character. In both
	 * cases the value runs to the end of the text when no delimiter is found.
	 * @param tag the tag whose text is searched
	 * @param field the attribute name to look for
	 * @return the attribute value, or the empty string when the attribute is absent
	 *         or when {@code tag}, its text, or {@code field} is null
	 */
	public String extractField(HTMLTag tag, String field)
	{
		if (tag == null || field == null) {
			return "";
		}
		String text = tag.getText();
		if (text == null) {
			return "";
		}
		// Search the original text directly: upper-casing a copy can change its
		// length, which would make the found index invalid for the original.
		int fieldPos = indexOfIgnoreCase(text, field);
		if (fieldPos == -1) {
			return "";
		}
		String rest = absorb(text.substring(fieldPos + field.length()), '=');

		int start = 0;
		while (start < rest.length() && Character.isWhitespace(rest.charAt(start))) {
			start++;
		}
		rest = rest.substring(start);

		int end;
		if (rest.length() > 0 && rest.charAt(0) == '"') {
			// Only treat the value as quoted when the quote opens THIS value;
			// a quote further along belongs to a later attribute.
			rest = rest.substring(1);
			end = rest.indexOf('"');
			if (end == -1) {
				end = rest.indexOf(' ');
			}
		} else {
			end = indexOfWhitespace(rest);
		}
		// No delimiter at all means the value runs to the end of the tag text.
		if (end != -1) {
			rest = rest.substring(0, end);
		}
		return rest;
	}

	/**
	 * Collects the text between an opening tag and its matching end tag.
	 * <p>
	 * {@code node} must be the opening tag named {@code tagName} (see
	 * {@link #isXMLTagFound(HTMLNode, String)}). Following string nodes are read
	 * from {@code reader} and joined with single spaces. Reading stops at the first
	 * node that is not a string node.
	 * @param node the node expected to be the opening tag
	 * @param tagName the element name, compared case-insensitively
	 * @param reader the reader from which the following nodes are read
	 * @return the collected text if the reader ran out of nodes or the terminating
	 *         node is an end tag named {@code tagName}; {@code null} if {@code node}
	 *         is not the opening tag, if any other node terminates the text, or if
	 *         reading fails with an I/O error
	 */
	public static String extractXMLData(HTMLNode node, String tagName, HTMLReader reader)
	{
		if (!isXMLTagFound(node, tagName)) {
			return null;
		}
		StringBuffer xmlData = new StringBuffer();
		try {
			do {
				node = reader.readElement();
				if (node instanceof HTMLStringNode) {
					if (xmlData.length() > 0) {
						xmlData.append(' ');
					}
					xmlData.append(((HTMLStringNode) node).getText());
				}
			} while (node instanceof HTMLStringNode);
		} catch (IOException e) {
			// The element could not be read completely; report "no data" rather
			// than handing back truncated text as if extraction had succeeded.
			return null;
		}
		if (node == null) {
			// The reader returned no further node: keep what was collected.
			return xmlData.toString();
		}
		if (!(node instanceof HTMLEndTag)) {
			return null;
		}
		String endName = ((HTMLEndTag) node).getContents();
		// Case-insensitive, consistent with how the opening tag is matched.
		return tagName.equalsIgnoreCase(endName) ? xmlData.toString() : null;
	}

	/**
	 * Get the filter associated with this node.
	 * @return the filter string
	 */
	public String getFilter()
	{
		return filter;
	}

	/**
	 * Tells whether a node is a tag whose text starts with the given element name.
	 * The comparison ignores case, and the name must be followed by the end of the
	 * tag text, whitespace, '/' or '&gt;' so that e.g. "B" does not match "BODY".
	 * @param node the node to test
	 * @param tagName the element name to look for
	 * @return true if {@code node} is an {@link HTMLTag} named {@code tagName};
	 *         false otherwise, including when {@code tagName} or the tag text is null
	 */
	public static boolean isXMLTagFound(HTMLNode node, String tagName)
	{
		if (tagName == null || !(node instanceof HTMLTag)) {
			return false;
		}
		String text = ((HTMLTag) node).getText();
		if (text == null) {
			return false;
		}
		int nameLength = tagName.length();
		if (!text.regionMatches(true, 0, tagName, 0, nameLength)) {
			return false;
		}
		if (text.length() == nameLength) {
			return true;
		}
		char next = text.charAt(nameLength);
		return Character.isWhitespace(next) || next == '/' || next == '>';
	}

	/**
	 * Scan the tag and extract the information related to this type. The url of the
	 * initiating scan is provided so that implementations can resolve relative links.
	 * The HTMLReader is provided in order to do a lookahead operation. We assume that
	 * the identification has already been performed using the evaluate() method.
	 * @param tag HTML Tag to be scanned for identification
	 * @param url The initiating url of the scan (Where the html page lies)
	 * @param reader The reader object responsible for reading the html page
	 * @param currLine presumably the line currently being parsed - confirm against callers
	 * @return the node produced for this tag
	 * @throws IOException if reading further input fails
	 */
	public abstract HTMLNode scan(HTMLTag tag, String url, HTMLReader reader, String currLine) throws IOException;

	/**
	 * Removes every occurrence of a character from a string.
	 * @param s the source string; must not be null
	 * @param occur the character to remove
	 * @return {@code s} with all occurrences of {@code occur} removed
	 */
	public String removeChars(String s, char occur)
	{
		StringBuffer newString = new StringBuffer(s.length());
		for (int i = 0; i < s.length(); i++) {
			char ch = s.charAt(i);
			if (ch != occur) {
				newString.append(ch);
			}
		}
		return newString.toString();
	}

	/**
	 * Case-insensitive, locale-independent variant of {@code String.indexOf(String)}.
	 * @return the index of the first match of {@code target} in {@code text}, or -1
	 */
	private static int indexOfIgnoreCase(String text, String target)
	{
		int targetLength = target.length();
		int lastStart = text.length() - targetLength;
		for (int i = 0; i <= lastStart; i++) {
			if (text.regionMatches(true, i, target, 0, targetLength)) {
				return i;
			}
		}
		return -1;
	}

	/**
	 * Finds the first whitespace character in a string.
	 * @return its index, or -1 when {@code s} contains no whitespace
	 */
	private static int indexOfWhitespace(String s)
	{
		for (int i = 0; i < s.length(); i++) {
			if (Character.isWhitespace(s.charAt(i))) {
				return i;
			}
		}
		return -1;
	}

}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -