📄 lexer.java
字号:
// HTMLParser Library $Name: v1_6 $ - A java-based parser for HTML// http://sourceforge.org/projects/htmlparser// Copyright (C) 2004 Derrick Oswald//// Revision Control Information//// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v $// $Author: derrickoswald $// $Date: 2006/06/10 15:11:32 $// $Revision: 1.50 $//// This library is free software; you can redistribute it and/or// modify it under the terms of the GNU Lesser General Public// License as published by the Free Software Foundation; either// version 2.1 of the License, or (at your option) any later version.//// This library is distributed in the hope that it will be useful,// but WITHOUT ANY WARRANTY; without even the implied warranty of// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU// Lesser General Public License for more details.//// You should have received a copy of the GNU Lesser General Public// License along with this library; if not, write to the Free Software// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA//package org.htmlparser.lexer;import java.io.Serializable;import java.net.MalformedURLException;import java.net.URLConnection;import java.util.Vector;import org.htmlparser.Node;import org.htmlparser.NodeFactory;import org.htmlparser.Remark;import org.htmlparser.Text;import org.htmlparser.Tag;import org.htmlparser.http.ConnectionManager;import org.htmlparser.nodes.RemarkNode;import org.htmlparser.nodes.TextNode;import org.htmlparser.nodes.TagNode;import org.htmlparser.util.ParserException;/** * This class parses the HTML stream into nodes. * There are three major types of nodes (lexemes): * <ul> * <li>Remark</li> * <li>Text</li> * <li>Tag</li> * </ul> * Each time <code>nextNode()</code> is called, another node is returned until * the stream is exhausted, and <code>null</code> is returned. */public class Lexer implements Serializable, NodeFactory{ // Please don't change the formatting of the version variables below. // This is done so as to facilitate ant script processing. /** * The floating point version number ({@value}). */ public static final double VERSION_NUMBER = 1.6 ; /** * The type of version ({@value}). */ public static final String VERSION_TYPE = "Release Build" ; /** * The date of the version ({@value}). */ public static final String VERSION_DATE = "Jun 10, 2006" ; // End of formatting /** * The display version ({@value}). */ public static final String VERSION_STRING = "" + VERSION_NUMBER + " (" + VERSION_TYPE + " " + VERSION_DATE + ")"; /** * Process remarks strictly flag. * If <code>true</code>, remarks are not terminated by ---$gt; * or --!$gt;, i.e. more than two dashes. If <code>false</code>, * a more lax (and closer to typical browser handling) remark parsing * is used. * Default <code>true</code>. */ public static boolean STRICT_REMARKS = true; /** * The page lexemes are retrieved from. */ protected Page mPage; /** * The current position on the page. */ protected Cursor mCursor; /** * The factory for new nodes. */ protected NodeFactory mFactory; /** * Line number to trigger on. * This is tested on each <code>nextNode()</code> call, as a debugging aid. * Alter this value and set a breakpoint on the guarded statement. * Remember, these line numbers are zero based, while most editors are * one based. * @see #nextNode */ protected static int mDebugLineTrigger = -1; // // Static methods // /** * Return the version string of this parser. * @return A string of the form: * <pre> * "[floating point number] ([build-type] [build-date])" * </pre> */ public static String getVersion () { return (VERSION_STRING); } // // Constructors // /** * Creates a new instance of a Lexer. */ public Lexer () { this (new Page ("")); } /** * Creates a new instance of a Lexer. * @param page The page with HTML text. */ public Lexer (Page page) { setPage (page); setCursor (new Cursor (page, 0)); setNodeFactory (this); } /** * Creates a new instance of a Lexer. * @param text The text to parse. */ public Lexer (String text) { this (new Page (text)); } /** * Creates a new instance of a Lexer. * @param connection The url to parse. * @exception ParserException If an error occurs opening the connection. */ public Lexer (URLConnection connection) throws ParserException { this (new Page (connection)); } // // Bean patterns // /** * Get the page this lexer is working on. * @return The page that nodes are being read from. */ public Page getPage () { return (mPage); } /** * Set the page this lexer is working on. * @param page The page that nodes will be read from. */ public void setPage (Page page) { if (null == page) throw new IllegalArgumentException ("page cannot be null"); // todo: sanity checks mPage = page; } /** * Get the current scanning position. * @return The lexer's cursor position. */ public Cursor getCursor () { return (mCursor); } /** * Set the current scanning position. * @param cursor The lexer's new cursor position. */ public void setCursor (Cursor cursor) { if (null == cursor) throw new IllegalArgumentException ("cursor cannot be null"); // todo: sanity checks mCursor = cursor; } /** * Get the current node factory. * @return The lexer's node factory. */ public NodeFactory getNodeFactory () { return (mFactory); } /** * Set the current node factory. * @param factory The node factory to be used by the lexer. */ public void setNodeFactory (NodeFactory factory) { if (null == factory) throw new IllegalArgumentException ("node factory cannot be null"); mFactory = factory; } /** * Get the current cursor position. * @return The current character offset into the source. */ public int getPosition () { return (getCursor ().getPosition ()); } /** * Set the current cursor position. * @param position The new character offset into the source. */ public void setPosition (int position) { // todo: sanity checks getCursor ().setPosition (position); } /** * Get the current line number. * @return The line number the lexer's working on. */ public int getCurrentLineNumber () { return (getPage ().row (getCursor ())); } /** * Get the current line. * @return The string the lexer's working on. */ public String getCurrentLine () { return (getPage ().getLine (getCursor ())); } // // Public methods // /** * Reset the lexer to start parsing from the beginning again. * The underlying components are reset such that the next call to * <code>nextNode()</code> will return the first lexeme on the page. */ public void reset () { getPage ().reset (); setCursor (new Cursor (getPage (), 0)); } /** * Get the next node from the source. * @return A Remark, Text or Tag, or <code>null</code> if no * more lexemes are present. * @exception ParserException If there is a problem with the * underlying page. */ public Node nextNode () throws ParserException { return nextNode (false); } /** * Get the next node from the source. * @param quotesmart If <code>true</code>, strings ignore quoted contents. * @return A Remark, Text or Tag, or <code>null</code> if no * more lexemes are present. * @exception ParserException If there is a problem with the * underlying page. */ public Node nextNode (boolean quotesmart) throws ParserException { int start; char ch; Node ret; // debugging suppport if (-1 != mDebugLineTrigger) { Page page = getPage (); int lineno = page.row (mCursor); if (mDebugLineTrigger < lineno) mDebugLineTrigger = lineno + 1; // trigger on next line too } start = mCursor.getPosition (); ch = mPage.getCharacter (mCursor); switch (ch) { case Page.EOF: ret = null; break; case '<': ch = mPage.getCharacter (mCursor); if (Page.EOF == ch) ret = makeString (start, mCursor.getPosition ()); else if ('%' == ch) { mPage.ungetCharacter (mCursor); ret = parseJsp (start); } else if ('?' == ch) { mPage.ungetCharacter (mCursor); ret = parsePI (start); } else if ('/' == ch || '%' == ch || Character.isLetter (ch)) { mPage.ungetCharacter (mCursor); ret = parseTag (start); } else if ('!' == ch) { ch = mPage.getCharacter (mCursor); if (Page.EOF == ch) ret = makeString (start, mCursor.getPosition ()); else { if ('>' == ch) // handle <!> ret = makeRemark (start, mCursor.getPosition ()); else { mPage.ungetCharacter (mCursor); // remark/tag need this char if ('-' == ch) ret = parseRemark (start, quotesmart); else { mPage.ungetCharacter (mCursor); // tag needs prior one too ret = parseTag (start); } } } } else ret = parseString (start, quotesmart); break; default: mPage.ungetCharacter (mCursor); // string needs to see leading foreslash ret = parseString (start, quotesmart); break; } return (ret); } /** * Return CDATA as a text node. * According to appendix <a href="http://www.w3.org/TR/html4/appendix/notes.html#notes-specifying-data"> * B.3.2 Specifying non-HTML data</a> of the * <a href="http://www.w3.org/TR/html4/">HTML 4.01 Specification</a>:<br> * <quote> * <b>Element content</b><br> * When script or style data is the content of an element (SCRIPT and STYLE), * the data begins immediately after the element start tag and ends at the * first ETAGO ("</") delimiter followed by a name start character ([a-zA-Z]); * note that this may not be the element's end tag. * Authors should therefore escape "</" within the content. Escape mechanisms * are specific to each scripting or style sheet language. * </quote> * @return The <code>TextNode</code> of the CDATA or <code>null</code> if none. * @exception ParserException If a problem occurs reading from the source. */ public Node parseCDATA ()
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -