📄 htmltag.java
字号:
// HTMLParser Library v1.1 - A java-based parser for HTML
// Copyright (C) Dec 31, 2000 Somik Raha
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// For any questions or suggestions, you can write to me at :
// Email :somik@kizna.com
//
// Postal Address :
// Somik Raha
// R&D Team
// Kizna Corporation
// Hiroo ON Bldg. 2F, 5-19-9 Hiroo,
// Shibuya-ku, Tokyo,
// 150-0012,
// JAPAN
// Tel : +81-3-54752646
// Fax : +81-3-5449-4870
// Website : www.kizna.com
package jm.util.html.tags;
import java.io.IOException;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.StringTokenizer;
import jm.util.html.HTMLNode;
import jm.util.html.HTMLReader;
import jm.util.html.scanners.HTMLTagScanner;
/**
* HTMLTag represents a generic tag. This class allows users to register specific
* tag scanners, which can identify links, or image references. This tag asks the
* scanners to run over the text, and identify. It can be used to dynamically
* configure a parser.
*/
public class HTMLTag implements HTMLNode
{
/**
* Key under which parseParameters() stores the tag name in the
* Hashtable it returns (Kaarle Kaila 3.8.2001)
*/
public final static String TAGNAME = "$<TAGNAME>$";
// States of the character automaton that find() drives.
/** Initial state: no opening '<' has been seen yet. */
protected final static int TAG_BEFORE_PARSING_STATE=0;
/** A '<' has been seen; characters are being collected as tag contents. */
protected final static int TAG_BEGIN_PARSING_STATE=1;
/** A '>' was seen while collecting; the tag is complete. */
protected final static int TAG_FINISHED_PARSING_STATE=2;
/** A '/' directly followed a '<'; find() stops and returns null. */
protected final static int TAG_ILLEGAL_STATE=3;
/** Inside a double-quoted section, where '>' does not end the tag. */
protected final static int TAG_IGNORE_DATA_STATE=4;
/**
* Text collected for this tag. find() appends the characters it reads
* after the opening '<' to this buffer.
*/
StringBuffer tagContents;
/**
* The beginning position of the tag in the line
*/
int tagBegin;
/**
* The ending position of the tag in the line
*/
int tagEnd;
/**
* Result of the most recent parseParameters() call, or null if
* parseParameters() has not been called on this tag yet.
* added by Kaarle Kaila 23.10.2001
*/
private Hashtable parsed=null;
/**
* Scanner associated with this tag (useful for extraction of filtering data from a
* HTML node)
*/
protected HTMLTagScanner thisScanner=null;
/**
* The line of input this tag was read from. While find() follows a tag
* across several lines this is replaced by each newly read line.
*/
private java.lang.String tagLine;
/**
* Creates a tag from its position in the line and its text.
* @param tagBegin Beginning position of the tag
* @param tagEnd Ending position of the tag
* @param tagContents The contents of the tag
* @param tagLine The current line being parsed, where the tag was found
*/
public HTMLTag(int tagBegin, int tagEnd, String tagContents, String tagLine)
{
    StringBuffer contents = new StringBuffer();
    contents.append(tagContents);
    this.tagContents = contents;
    this.tagLine = tagLine;
    this.tagEnd = tagEnd;
    this.tagBegin = tagBegin;
}
/**
* Returns the position at which this tag begins.
* @return the recorded begin position of the tag
*/
public int elementBegin()
{
    return this.tagBegin;
}
/**
* Returns the position at which this tag ends.
* @return the recorded end position of the tag
*/
public int elementEnd()
{
    return this.tagEnd;
}
/**
* Locates a tag within the input string, parsing from the given position.
* Parsing continues on the following lines (fetched from the reader) while
* a tag has been opened but not yet closed.
* @param reader HTML reader used to fetch the next line when a tag spans lines
* @param input Input String
* @param position Position to start parsing from
* @return the tag that was found, or null when the scanned text holds no
* complete tag: the line ended before a tag was completed, a '/' directly
* followed a '<', or no further line was available for an unfinished tag
*/
public static HTMLTag find(HTMLReader reader,String input,int position)
{
    int state = TAG_BEFORE_PARSING_STATE;
    int i = position;
    HTMLTag tag = new HTMLTag(0,0,"",input);
    while (state!=TAG_FINISHED_PARSING_STATE && state!=TAG_ILLEGAL_STATE)
    {
        String line = tag.getTagLine();
        // No line to look at: either the caller passed null, or the reader
        // had nothing more to deliver while the tag was still open
        // (presumably end of input - confirm against HTMLReader.getNextLine).
        if (line == null) break;
        // An empty line has no character to feed to the automaton, but a
        // multi-line tag (such as a JSP tag) may legitimately contain one,
        // so the counter still has to be advanced to fetch the next line.
        boolean emptyLine = (i == 0 && line.length() == 0);
        if (!emptyLine) {
            // Ran off the end of the line without completing a tag; stop
            // here instead of indexing past the last character.
            if (i >= line.length()) break;
            state = automataInput(state, i, line.charAt(i), tag);
        }
        i = incrementCounter(reader, state, i, tag);
    }
    return (state==TAG_FINISHED_PARSING_STATE) ? tag : null;
}
/**
* Advances the parsing position by one character. When a tag is still open
* (collecting or inside quotes) and the position is on or past the last
* character of the current line, the next line is fetched from the reader,
* a newline is appended to the tag contents and parsing restarts at index 0.
* @param reader reader that supplies the next line
* @param state current automaton state
* @param i current position in the line
* @param tag tag being assembled
* @return the position of the next character to examine
*/
public static int incrementCounter(HTMLReader reader, int state, int i, HTMLTag tag) {
    boolean tagStillOpen = (state==TAG_BEGIN_PARSING_STATE) || (state==TAG_IGNORE_DATA_STATE);
    if (!tagStillOpen) {
        return i + 1;
    }
    int lastIndex = tag.getTagLine().length() - 1;
    if (i < lastIndex) {
        return i + 1;
    }
    // The tag continues on the next line: switch to it and record the
    // line break in the tag contents.
    tag.setTagLine(reader.getNextLine());
    tag.append('\n');
    return 0;
}
/**
* Feeds one character to the tag automaton and returns the resulting state.
* The order of the steps below matters and must not be changed.
* @param state state before the character is consumed
* @param i index of the character in the tag's current line
* @param ch the character at index i
* @param tag tag being assembled; receives begin/end positions and contents
* @return state after the character is consumed
*/
protected static int automataInput(int state, int i, char ch, HTMLTag tag) {
// A '/' right after '<' aborts the search.
state = checkIllegalState(state, i, ch, tag);
// Runs before the append step, so the closing '>' is not added to the contents.
state = checkFinishedState(state, i, ch, tag);
// A '"' switches between collecting and quoted mode; both modes append,
// so the quote character itself is kept in the contents.
state = toggleIgnoringState(state, ch);
checkIfAppendable(state, ch, tag);
// Runs after the append step, so the opening '<' is not added to the contents.
state = checkBeginParsingState(state, i, ch, tag);
return state;
}
/**
* Starts collecting a tag when a '<' arrives in the before-parsing or
* illegal state, recording i as the tag's begin position.
* @return TAG_BEGIN_PARSING_STATE if a tag was opened, otherwise the unchanged state
*/
private static int checkBeginParsingState(int state, int i, char ch, HTMLTag tag) {
    boolean mayOpenTag = (state==TAG_BEFORE_PARSING_STATE) || (state==TAG_ILLEGAL_STATE);
    if (ch!='<' || !mayOpenTag) {
        return state;
    }
    // Transition into collecting mode - data is recorded until '>' arrives.
    tag.setTagBegin(i);
    return TAG_BEGIN_PARSING_STATE;
}
/**
* Appends ch to the tag contents while the automaton is collecting a tag,
* either normally or inside a quoted section.
*/
private static void checkIfAppendable(int state, char ch, HTMLTag tag) {
    switch (state) {
        case TAG_BEGIN_PARSING_STATE:
        case TAG_IGNORE_DATA_STATE:
            tag.append(ch);
            break;
        default:
            break;
    }
}
/**
* Switches between collecting mode and ignoring mode on every double quote.
* In ignoring mode a '>' does not finish the tag, which avoids problems
* with end-tag characters inside quoted values (occurring with JSP tags).
* @return the toggled state, or the unchanged state for any other
* character or any other starting state
*/
private static int toggleIgnoringState(int state, char ch) {
    if (ch!='"') {
        return state;
    }
    switch (state) {
        case TAG_IGNORE_DATA_STATE:
            return TAG_BEGIN_PARSING_STATE;
        case TAG_BEGIN_PARSING_STATE:
            return TAG_IGNORE_DATA_STATE;
        default:
            return state;
    }
}
/**
* Completes the tag when a '>' arrives in collecting mode, recording i as
* the tag's end position.
* @return TAG_FINISHED_PARSING_STATE if the tag was closed, otherwise the unchanged state
*/
private static int checkFinishedState(int state, int i, char ch, HTMLTag tag) {
    boolean closesTag = (state==TAG_BEGIN_PARSING_STATE) && (ch=='>');
    if (!closesTag) {
        return state;
    }
    tag.setTagEnd(i);
    return TAG_FINISHED_PARSING_STATE;
}
/**
* Flags the illegal state when ch is a '/' whose preceding character in
* the tag's current line is '<'.
* @return TAG_ILLEGAL_STATE in that case, otherwise the unchanged state
*/
private static int checkIllegalState(int state, int i, char ch, HTMLTag tag) {
    if (i<=0 || ch!='/') {
        return state;
    }
    char previous = tag.getTagLine().charAt(i-1);
    return (previous=='<') ? TAG_ILLEGAL_STATE : state;
}
/**
* Flags the illegal state when ch is a '/' whose preceding character in
* input is '<'.
* @param input line being parsed
* @param state current automaton state
* @param i index of ch within input
* @param ch character being examined
* @return TAG_ILLEGAL_STATE in that case, otherwise the unchanged state
*/
protected static int automataIllegalState(String input, int state, int i, char ch) {
    if (i<=0 || ch!='/') {
        return state;
    }
    char previous = input.charAt(i-1);
    return (previous=='<') ? TAG_ILLEGAL_STATE : state;
}
/**
* Returns the value of a parameter from the table built by
* parseParameters(). The name is upper-cased before the lookup.
* @param name name of the parameter
* @return the stored value, or null if parseParameters() has not been
* called yet or the table has no entry for the name
* @author Kaarle Kaila 23.10.2001
*/
public String getParameter(String name){
    return (parsed == null) ? null : (String)parsed.get(name.toUpperCase());
}
/**
* Returns the table built by the most recent parseParameters() call.
* @return the parameter table itself (not a copy), or null if
* parseParameters() has not been called yet
*/
public Hashtable getParameters() {
    return this.parsed;
}
/**
* Returns the tag name stored under TAGNAME by parseParameters().
* @return the stored tag name, or null if parseParameters() has not been
* called yet or the table has no TAGNAME entry
* @author Kaarle Kaila 23.10.2001
*/
public String getTag(){
    return (parsed == null) ? null : (String)parsed.get(TAGNAME);
}
/**
* Returns the line of input currently associated with this tag.
* Creation date: (6/6/2001 12:09:38 PM)
* @return java.lang.String
*/
public java.lang.String getTagLine() {
    return this.tagLine;
}
/**
* Returns the text collected for this tag.
* @return the current tag contents as a String
*/
public String getText()
{
    String text = tagContents.toString();
    return text;
}
/**
* Returns the scanner associated with this tag.
* @return the scanner set through setThisScanner, or null if none was set
*/
public HTMLTagScanner getThisScanner()
{
    return this.thisScanner;
}
/**
* Breaks the text of this tag into the tag name and its parameters.
* The tag name is stored under the key HTMLTag.TAGNAME. Every parameter
* is stored as a name/value pair; a parameter without a value gets the
* value "". As html is case insensitive but Hashtable is not, the tag
* name and all parameter names are converted to UPPERCASE.
* Values may be enclosed in double or single quotes. A quoted value is
* always taken literally, even when it is empty or consists of a
* delimiter character (for example alt="" or title="=").
* The result is also remembered, so that getParameter(), getParameters()
* and getTag() can be used afterwards.
* Limitation: white space around '=' is not supported; in
* <code>name = value</code> the name is stored with the value "".
* E.g. extract the href values from A-tags and print them
* <pre>
*
* HTMLTag tag;
* Hashtable h;
* String tmp;
* try {
*     HTMLReader in = new HTMLReader(new FileReader(path),2048);
*     HTMLParser p = new HTMLParser(in);
*     Enumeration en = p.elements();
*     while (en.hasMoreElements()) {
*         try {
*             tag = (HTMLTag)en.nextElement();
*             h = tag.parseParameters();
*             tmp = (String)h.get(HTMLTag.TAGNAME);
*             if (tmp != null &amp;&amp; tmp.equalsIgnoreCase("A")) {
*                 System.out.println("URL is :" + h.get("HREF"));
*             }
*         } catch (ClassCastException ce){}
*     }
* }
* catch (IOException ie) {
*     ie.printStackTrace();
* }
* </pre>
*
* @return a Hashtable holding the tag name and the parameters of this tag
* @author Kaarle Kaila
* @version 7 AUG 2001
*/
public Hashtable parseParameters(){
    final String delim = " \t\r\n\f=\"'>";
    Hashtable<String, String> h = new Hashtable<String, String>();
    String name = null;
    String value = null;
    String t = null;
    boolean isAmp = false;            // inside a "..." section
    boolean isApo = false;            // inside a '...' section
    boolean isValue = false;          // an '=' was read, the next token is the value
    boolean isName = true;            // the tag name has not been stored yet
    boolean waitingForValue = false;  // a name was read, an '=' may follow
    // The extra trailing space guarantees a final delimiter, so the last
    // name/value pair is flushed into the table.
    StringTokenizer token = new StringTokenizer(getText() + " ", delim, true);
    while (token.hasMoreTokens()) {
        String st = token.nextToken();
        // True when t is the complete content of a quoted section.
        boolean quoted = false;
        //
        // First let's combine tokens that are inside "" or ''
        //
        if (isAmp || isApo) {
            if (isAmp && st.equals("\"")) {
                isAmp = false;
                quoted = true;
            } else if (isApo && st.equals("'")) {
                isApo = false;
                quoted = true;
            } else {
                t += st;
                continue;
            }
        } else if (st.equals("\"")) {
            isAmp = true;
            t = "";
            continue;
        } else if (st.equals("'")) {
            isApo = true;
            t = "";
            continue;
        } else {
            t = st;
        }
        // above leaves t with
        // - a delimiter
        // - a name of a parameter or the tag itself
        // - a value of a parameter
        // Quoted text is never a delimiter. Without this check an empty
        // quoted value would match (indexOf("") is 0), as would a quoted
        // value that is a single delimiter character; the parameter would
        // be lost and the next name would be taken as its value.
        if (!quoted && delim.indexOf(t) >= 0) {
            // t was a delimiter
            if (waitingForValue && t.equals("=")) {
                // here set to receive next value of parameter
                waitingForValue = false;
                isValue = true;
                value = "";
            }
            if (name != null && !isValue) {
                if (isName && value == null) {
                    value = TAGNAME;
                } else if (value == null) {
                    value = ""; // Hashtable does not accept nulls
                }
                if (isName) {
                    // store tagname as TAGNAME,tag
                    h.put(value, name.toUpperCase());
                } else {
                    // store tag parameters as NAME, value
                    h.put(name.toUpperCase(), value);
                }
                isName = false;
                name = null;
                value = null;
            }
        } else if (isValue) {
            value = t;
            isValue = false;
        } else if (name == null) {
            name = t;
            waitingForValue = true;
        }
    }
    parsed = h;
    return h;
}
/**
* Prints the contents of the tag together with its begin and end
* positions to standard output.
*/
public void print()
{
    StringBuffer message = new StringBuffer("Begin Tag : ");
    message.append(tagContents);
    message.append("; begins at : ").append(elementBegin());
    message.append("; ends at : ").append(elementEnd());
    System.out.println(message.toString());
}
/**
* Returns the tag text enclosed in angle brackets.
* @return "<" followed by getText() followed by ">"
*/
@Override
public String toString() {
    StringBuffer rendered = new StringBuffer("<");
    rendered.append(getText());
    rendered.append(">");
    return rendered.toString();
}
/**
* Composes the textual form of this node: the tag text enclosed in
* angle brackets.
* @return "<" followed by getText() followed by ">"
*/
public String composeNode() {
    StringBuffer composed = new StringBuffer("<");
    composed.append(getText());
    composed.append(">");
    return composed.toString();
}
/**
* Offers this tag to the registered scanners in turn. The first scanner
* whose evaluate() accepts the tag contents scans the tag, and its result
* is returned; the remaining scanners are not consulted.
* @param scanners Scanners to try, in order
* @param url URL at which HTML page is located
* @param reader The HTMLReader that is to be used for reading the url
* @return the node produced by the first accepting scanner, or this tag
* itself when no scanner accepts it
* @throws IOException declared for the scanning performed by the scanners
*/
public HTMLNode scan(Enumeration scanners,String url,HTMLReader reader) throws IOException
{
    while (scanners.hasMoreElements())
    {
        HTMLTagScanner candidate = (HTMLTagScanner)scanners.nextElement();
        boolean accepted = candidate.evaluate(tagContents.toString(),reader.getPreviousOpenScanner());
        if (!accepted) {
            continue;
        }
        // The accepting scanner is registered as the open scanner only
        // for the duration of its own scan.
        reader.setPreviousOpenScanner(candidate);
        HTMLNode scanned = candidate.scan(this,url,reader,tagLine);
        reader.setPreviousOpenScanner(null);
        return scanned;
    }
    return this;
}
/**
* Replaces the line of input associated with this tag.
* Creation date: (6/6/2001 12:09:38 PM)
* @param newTagLine java.lang.String
*/
public void setTagLine(java.lang.String newTagLine) {
    this.tagLine = newTagLine;
}
/**
* Replaces the contents of this tag with the given text.
* added by lhl.
* @param tagText the new tag contents
*/
public void setText(String tagText) {
    StringBuffer replacement = new StringBuffer(tagText);
    this.tagContents = replacement;
}
/**
* Sets the scanner associated with this tag.
* @param scanner the scanner to associate with this tag
*/
public void setThisScanner(HTMLTagScanner scanner)
{
    this.thisScanner = scanner;
}
/**
* Sets the ending position of the tag.
* @param tagEnd The tagEnd to set
*/
public void setTagEnd(int tagEnd) {
    int newEnd = tagEnd;
    this.tagEnd = newEnd;
}
/**
* Sets the beginning position of the tag.
* @param tagBegin The tagBegin to set
*/
public void setTagBegin(int tagBegin) {
    int newBegin = tagBegin;
    this.tagBegin = newBegin;
}
/**
* Appends a single character to the contents of this tag.
* @param ch the character to append
*/
public void append(char ch) {
    this.tagContents.append(ch);
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -