📄 lexer.java
字号:
// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML// http://sourceforge.org/projects/htmlparser// Copyright (C) 2004 Derrick Oswald//// Revision Control Information//// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v $// $Author: derrickoswald $// $Date: 2006/03/19 21:26:32 $// $Revision: 1.44 $//// This library is free software; you can redistribute it and/or// modify it under the terms of the GNU Lesser General Public// License as published by the Free Software Foundation; either// version 2.1 of the License, or (at your option) any later version.//// This library is distributed in the hope that it will be useful,// but WITHOUT ANY WARRANTY; without even the implied warranty of// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU// Lesser General Public License for more details.//// You should have received a copy of the GNU Lesser General Public// License along with this library; if not, write to the Free Software// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA//package org.htmlparser.lexer;import java.io.Serializable;import java.net.MalformedURLException;import java.net.URLConnection;import java.util.Vector;import org.htmlparser.Node;import org.htmlparser.NodeFactory;import org.htmlparser.Remark;import org.htmlparser.Text;import org.htmlparser.Tag;import org.htmlparser.http.ConnectionManager;import org.htmlparser.nodes.RemarkNode;import org.htmlparser.nodes.TextNode;import org.htmlparser.nodes.TagNode;import org.htmlparser.util.ParserException;/** * This class parses the HTML stream into nodes. * There are three major types of nodes (lexemes): * <ul> * <li>Remark</li> * <li>Text</li> * <li>Tag</li> * </ul> * Each time <code>nextNode()</code> is called, another node is returned until * the stream is exhausted, and <code>null</code> is returned. */public class Lexer implements Serializable, NodeFactory{ /** * The page lexemes are retrieved from. */ protected Page mPage; /** * The current position on the page. */ protected Cursor mCursor; /** * The factory for new nodes. */ protected NodeFactory mFactory; /** * Line number to trigger on. * This is tested on each <code>nextNode()</code> call, as a debugging aid. * Alter this value and set a breakpoint on the guarded statement. * Remember, these line numbers are zero based, while most editors are * one based. * @see #nextNode */ protected static int mDebugLineTrigger = -1; /** * Creates a new instance of a Lexer. */ public Lexer () { this (new Page ("")); } /** * Creates a new instance of a Lexer. * @param page The page with HTML text. */ public Lexer (Page page) { setPage (page); setCursor (new Cursor (page, 0)); setNodeFactory (this); } /** * Creates a new instance of a Lexer. * @param text The text to parse. */ public Lexer (String text) { this (new Page (text)); } /** * Creates a new instance of a Lexer. * @param connection The url to parse. * @exception ParserException If an error occurs opening the connection. */ public Lexer (URLConnection connection) throws ParserException { this (new Page (connection)); } /** * Reset the lexer to start parsing from the beginning again. * The underlying components are reset such that the next call to * <code>nextNode()</code> will return the first lexeme on the page. */ public void reset () { getPage ().reset (); setCursor (new Cursor (getPage (), 0)); } /** * Get the page this lexer is working on. * @return The page that nodes are being read from. */ public Page getPage () { return (mPage); } /** * Set the page this lexer is working on. * @param page The page that nodes will be read from. */ public void setPage (Page page) { if (null == page) throw new IllegalArgumentException ("page cannot be null"); // todo: sanity checks mPage = page; } /** * Get the current scanning position. * @return The lexer's cursor position. */ public Cursor getCursor () { return (mCursor); } /** * Set the current scanning position. * @param cursor The lexer's new cursor position. */ public void setCursor (Cursor cursor) { if (null == cursor) throw new IllegalArgumentException ("cursor cannot be null"); // todo: sanity checks mCursor = cursor; } /** * Get the current node factory. * @return The lexer's node factory. */ public NodeFactory getNodeFactory () { return (mFactory); } /** * Set the current node factory. * @param factory The node factory to be used by the lexer. */ public void setNodeFactory (NodeFactory factory) { if (null == factory) throw new IllegalArgumentException ("node factory cannot be null"); mFactory = factory; } /** * Get the current cursor position. * @return The current character offset into the source. */ public int getPosition () { return (getCursor ().getPosition ()); } /** * Set the current cursor position. * @param position The new character offset into the source. */ public void setPosition (int position) { // todo: sanity checks getCursor ().setPosition (position); } /** * Get the current line number. * @return The line number the lexer's working on. */ public int getCurrentLineNumber () { return (getPage ().row (getCursor ())); } /** * Get the current line. * @return The string the lexer's working on. */ public String getCurrentLine () { return (getPage ().getLine (getCursor ())); } /** * Get the next node from the source. * @return A Remark, Text or Tag, or <code>null</code> if no * more lexemes are present. * @exception ParserException If there is a problem with the * underlying page. */ public Node nextNode () throws ParserException { return nextNode (false); } /** * Get the next node from the source. * @param quotesmart If <code>true</code>, strings ignore quoted contents. * @return A Remark, Text or Tag, or <code>null</code> if no * more lexemes are present. * @exception ParserException If there is a problem with the * underlying page. */ public Node nextNode (boolean quotesmart) throws ParserException { int start; char ch; Node ret; // debugging suppport if (-1 != mDebugLineTrigger) { Page page = getPage (); int lineno = page.row (mCursor); if (mDebugLineTrigger < lineno) mDebugLineTrigger = lineno + 1; // trigger on next line too } start = mCursor.getPosition (); ch = mPage.getCharacter (mCursor); switch (ch) { case Page.EOF: ret = null; break; case '<': ch = mPage.getCharacter (mCursor); if (Page.EOF == ch) ret = makeString (start, mCursor.getPosition ()); else if ('%' == ch) { mCursor.retreat (); ret = parseJsp (start); } else if ('?' == ch) { mCursor.retreat (); ret = parsePI (start); } else if ('/' == ch || '%' == ch || Character.isLetter (ch)) { mCursor.retreat (); ret = parseTag (start); } else if ('!' == ch) { ch = mPage.getCharacter (mCursor); if (Page.EOF == ch) ret = makeString (start, mCursor.getPosition ()); else { if ('>' == ch) // handle <!> ret = makeRemark (start, mCursor.getPosition ()); else { mCursor.retreat (); // remark/tag need this char if ('-' == ch) ret = parseRemark (start, quotesmart); else { mCursor.retreat (); // tag needs prior one too ret = parseTag (start); } } } } else ret = parseString (start, quotesmart); break; default: mCursor.retreat (); // string needs to see leading foreslash ret = parseString (start, quotesmart); break; } return (ret); } /** * Advance the cursor through a JIS escape sequence. * @param cursor A cursor positioned within the escape sequence. * @exception ParserException If a problem occurs reading from the source. */ protected void scanJIS (Cursor cursor) throws ParserException { boolean done; char ch; int state; done = false; state = 0; while (!done) { ch = mPage.getCharacter (cursor); if (Page.EOF == ch) done = true; else switch (state) { case 0: if (0x1b == ch) // escape state = 1; break; case 1: if ('(' == ch) state = 2; else state = 0; break; case 2: if ('J' == ch) done = true; else state = 0; break; default: throw new IllegalStateException ("state " + state); } } } /** * Parse a string node. * Scan characters until "</", "<%", "<!" or < followed by a * letter is encountered, or the input stream is exhausted, in which * case <code>null</code> is returned. * @param start The position at which to start scanning. * @param quotesmart If <code>true</code>, strings ignore quoted contents. * @return The parsed node. * @exception ParserException If a problem occurs reading from the source. */ protected Node parseString (int start, boolean quotesmart) throws ParserException { boolean done; char ch; char quote; done = false; quote = 0; while (!done) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -