📄 parser.java

📁 linux下编程用编译软件
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/* Parser.java -- HTML parser.
   Copyright (C) 2005 Free Software Foundation, Inc.

This file is part of GNU Classpath.

GNU Classpath is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.

GNU Classpath is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with GNU Classpath; see the file COPYING.  If not, write to the
Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301 USA.

Linking this library statically or dynamically with other modules is
making a combined work based on this library.  Thus, the terms and
conditions of the GNU General Public License cover the whole
combination.

As a special exception, the copyright holders of this library give you
permission to link this library with independent modules to produce an
executable, regardless of the license terms of these independent
modules, and to copy and distribute the resulting executable under
terms of your choice, provided that you also meet, for each linked
independent module, the terms and conditions of the license of that
module.  An independent module is a module which is not derived from
or based on this library.  If you modify this library, you may extend
this exception to your version of the library, but you are not
obligated to do so.  If you do not wish to do so, delete this
exception statement from your version.
*/package gnu.javax.swing.text.html.parser.support;import gnu.javax.swing.text.html.parser.htmlAttributeSet;import gnu.javax.swing.text.html.parser.htmlValidator;import gnu.javax.swing.text.html.parser.support.low.Constants;import gnu.javax.swing.text.html.parser.support.low.ParseException;import gnu.javax.swing.text.html.parser.support.low.ReaderTokenizer;import gnu.javax.swing.text.html.parser.support.low.Token;import gnu.javax.swing.text.html.parser.support.low.node;import gnu.javax.swing.text.html.parser.support.low.pattern;import java.io.IOException;import java.io.Reader;import java.util.Comparator;import java.util.Set;import java.util.TreeSet;import java.util.Vector;import javax.swing.text.ChangedCharSetException;import javax.swing.text.html.HTML;import javax.swing.text.html.parser.AttributeList;import javax.swing.text.html.parser.DTD;import javax.swing.text.html.parser.DTDConstants;import javax.swing.text.html.parser.Element;import javax.swing.text.html.parser.Entity;import javax.swing.text.html.parser.TagElement;/** * <p>A simple error-tolerant HTML parser that uses a DTD document * to access data on the possible tokens, arguments and syntax.</p> * <p> The parser reads an HTML content from a Reader and calls various * notifying methods (which should be overridden in a subclass) * when tags or data are encountered.</p> * <p>Some HTML elements need no opening or closing tags. The * task of this parser is to invoke the tag handling methods also when * the tags are not explicitly specified and must be supposed using * information, stored in the DTD. 
* For example, parsing the document
 * <p>&lt;table&gt;&lt;tr&gt;&lt;td&gt;a&lt;td&gt;b&lt;td&gt;c&lt;/tr&gt; <br>
 * will invoke the handling methods in exactly the same order
 * (and with the same parameters) as if parsing the document: <br>
 * <em>&lt;html&gt;&lt;head&gt;&lt;/head&gt;&lt;body&gt;&lt;table&gt;&lt;
 * tbody&gt;</em>&lt;tr&gt;&lt;td&gt;a<em>&lt;/td&gt;</em>&lt;td&gt;b<em>
 * &lt;/td&gt;</em>&lt;td&gt;c<em>&lt;/td&gt;&lt;/tr&gt;</em>&lt;
 * <em>/tbody&gt;&lt;/table&gt;&lt;/body&gt;&lt;/html&gt;</em></p>
 * (supposed tags are given in italics). The parser also supports
 * obsolete elements of HTML syntax.<p>
 * </p>
 * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org)
 */
public class Parser
  extends ReaderTokenizer
  implements DTDConstants
{
  /**
   * The current html tag.
   */
  public Token hTag = new Token();

  /**
   * The document template description that will be used to parse the documents.
   */
  protected DTD dtd;

  /**
   * The value of this field determines whether or not the Parser will be
   * strict in enforcing SGML compatibility. The default value is false,
   * stating that the parser should do everything to parse and get at least
   * some information even from the incorrectly written HTML input.
   */
  protected boolean strict;

  /**
   * This field has a positive value inside preformatted tags.
   */
  protected int preformatted = 0;

  /**
   * The set of the document tags, compared without regard to case.
   * This field is used for supporting markFirstTime().
   * String.CASE_INSENSITIVE_ORDER orders strings exactly as
   * String.compareToIgnoreCase does, so no hand-written comparator
   * is required.
   */
  private Set documentTags = new TreeSet(String.CASE_INSENSITIVE_ORDER);

  /**
   * The buffer to collect the incremental output like text or comment.
   */
  private StringBuffer buffer = new StringBuffer();

  /**
   * The buffer to store the document title.
   */
  private StringBuffer title = new StringBuffer();

  /**
   * The current token.
*/  private Token t;  /**   * True means that the 'title' tag of this document has   * already been handled.   */  private boolean titleHandled;  /**   * True means that the 'title' tag is currently open and all   * text is also added to the title buffer.   */  private boolean titleOpen;  /**   * The attributes of the current HTML element.   * Package-private to avoid an accessor method.   */  htmlAttributeSet attributes =    htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET;  /**   * The validator, controlling the forcible closing of the tags that   * (in accordance to dtd) are not allowed in the current context.   */  private htmlValidator validator;  /**   * Provides the default values for parameters in the case when these   * values are defined in the DTD.   */  private parameterDefaulter defaulter;  /**   * The text pre-processor for handling line ends and tabs.   */  private textPreProcessor textProcessor = new textPreProcessor();  /**   * Creates a new Parser that uses the given   * {@link javax.swing.text.html.parser.DTD }. The only standard way   * to get an instance of DTD is to construct it manually, filling in   * all required fields.   * @param a_dtd The DTD to use. The parser behaviour after passing null   * as an argument is not documented and may vary between implementations.   */  public Parser(DTD a_dtd)  {    if (a_dtd == null)      dtd = gnu.javax.swing.text.html.parser.HTML_401F.getInstance();    else      dtd = a_dtd;    defaulter = new parameterDefaulter(dtd);    validator =      new htmlValidator(dtd)        {          /**           * Handles the error message. This method must be overridden to pass           * the message where required.           * @param msg The message text.           */          protected void s_error(String msg)          {            error(msg);          }          /**           * The method is called when the tag validator decides to close the           * tag on its own initiative. 
After reaching the end of stream,           * The tag validator closes all unclosed elements that are required           * to have the end (closing) tag.           *           * @param element The tag being fictionally (forcibly) closed.           */          protected void handleSupposedEndTag(Element tElement)          {            // The tag is cloned as the original tElement is the            // element from the starting tag - may be accidently used            // somewhere else.            TagElement tag = makeTag(tElement, true);            _handleEndTag_remaining(tag);          }          /**           * The method is called when the the tag validator decides to open           * the new tag on its own initiative. The tags, opened in this           * way, are HTML, HEAD and BODY. The attribute set is temporary           * assigned to the empty one, the previous value is           * restored before return.           *           * @param element The tag being fictionally (forcibly) closed.           */          protected void handleSupposedStartTag(Element tElement)          {            TagElement tag = makeTag(tElement, true);            htmlAttributeSet were = attributes;            attributes = htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET;            _handleStartTag(tag);            attributes = were;          }        };  }  /**   * Get the attributes of the current tag.   * @return The attribute set, representing the attributes of the current tag.   */  public htmlAttributeSet getAttributes()  {    return attributes;  }  /**   * Invokes the error handler. The default method in this implementation   * delegates the call to handleError, also providing the current line.   
*/  public void error(String msg)  {    error(msg, getTokenAhead());  }  public void error(String msg, Token atToken)  {    if (atToken != null)      handleError(atToken.where.beginLine,                  msg + ": line " + atToken.where.beginLine +                  ", absolute pos " + atToken.where.startPosition                 );    else      handleError(0, msg);  }  /**   * Invokes the error handler. The default method in this implementation   * delegates the call to error (parm1+": '"+parm2+"'").   */  public void error(String msg, String invalid)  {    error(msg + ": '" + invalid + "'");  }  /**   * Invokes the error handler. The default method in this implementation   * delegates the call to error (parm1+" "+ parm2+" "+ parm3).   */  public void error(String parm1, String parm2, String parm3)  {    error(parm1 + " " + parm2 + " " + parm3);  }  /**   * Invokes the error handler. The default method in this implementation   * delegates the call to error (parm1+" "+ parm2+" "+ parm3+" "+ parm4).   */  public void error(String parm1, String parm2, String parm3, String parm4)  {    error(parm1 + " " + parm2 + " " + parm3 + " " + parm4);  }  public void flushAttributes()  {  }  /**   * Parse the HTML text, calling various methods in response to the   * occurence of the corresponding HTML constructions.   * @param reader The reader to read the source HTML from.   * @throws IOException If the reader throws one.   */  public synchronized void parse(Reader reader)                          throws IOException  {    reset(reader);    restart();    try      {        parseDocument();        validator.closeAll();      }    catch (ParseException ex)      {        if (ex != null)          {            error("Unable to continue parsing the document", ex.getMessage());            Throwable cause = ex.getCause();            if (cause instanceof IOException)              throw (IOException) cause;          }      }  }  /**   * Parses DTD markup declaration. 
* Currently returns null without action.
   * @return null.
   * @throws IOException not thrown by this default implementation.
   */
  public String parseDTDMarkup()
                        throws IOException
  {
    return null;
  }

  /**
   * Parse SGML insertion ( &lt;! ... &gt; ). When the
   * SGML insertion is found, this method is called, passing
   * SGML in the string buffer as a parameter. The default method
   * returns false without action and can be overridden to
   * implement user - defined SGML support.
   * <p>
   * If you need more information about SGML insertions in HTML documents,
   * the author suggests to read SGML tutorial on
   * <a href="http://www.w3.org/TR/WD-html40-970708/intro/sgmltut.html">
   * http://www.w3.org/TR/WD-html40-970708/intro/sgmltut.html</a>.
   * We also recommend Goldfarb C.F (1991) <i>The SGML Handbook</i>,
   * Oxford University Press, 688 p, ISBN: 0198537379.
   * </p>
   * @param strBuff The string buffer holding the SGML insertion.
   * @return true if this is a valid DTD markup declaration; this default
   * implementation always returns false.
   * @throws IOException not thrown by this default implementation.
   */
  public boolean parseMarkupDeclarations(StringBuffer strBuff)
                                  throws IOException
  {
    return false;
  }

  /**
   * Get the first line of the last parsed token.
   *
   * @return the begin line recorded in the location of hTag.
   */
  protected int getCurrentLine()
  {
    return hTag.where.beginLine;
  }

  /**
   * Read parseable character data, add to buffer. Tokens are consumed
   * until the BEGIN token or the end of input is seen ahead; entity tokens
   * are resolved before being appended. If the buffer is not empty
   * afterwards, the text handler is invoked.
   *
   * @param clearBuffer If true, the buffer is emptied first, so it is
   * filled by the CDATA section only; otherwise the section is appended
   * to the existing content of the buffer.
   *
   * @throws ParseException propagated from the token reading helpers,
   * if they throw it.
   */
  protected void CDATA(boolean clearBuffer)
                throws ParseException
  {
    Token start = hTag = getTokenAhead();

    if (clearBuffer)
      buffer.setLength(0);

    // Handle expected EOF.
    if (start.kind == EOF)
      return;

    while (true)
      {
        t = getTokenAhead();
        if (t.kind == EOF)
          {
            error("unexpected eof", t);
            break;
          }
        if (t.kind == BEGIN)
          break;

        if (t.kind == Constants.ENTITY)
          resolveAndAppendEntity(t);
        else
          append(t);

        // The token was only looked ahead so far; consume it now.
        getNextToken();
      }

    hTag = new Token(start, getTokenAhead(0));
    if (buffer.length() != 0)
      _handleText();
  }

  /**
  * Process Comment. This method skips till --> without
  * taking SGML constructs into consideration.  The supported SGML
  * constructs are handled separately.
  */
  protected void Comment()
                  throws ParseException
  {
    buffer.setLength(0);

    Token start = hTag = mustBe(BEGIN);
    optional(WS);
    mustBe(EXCLAMATION);
    optional(WS);
    mustBe(DOUBLE_DASH);

    Token t;
    Token last;

    comment:
    while (true)
      {
        t = getTokenAhead();
        if (t.kind == EOF)
          {
            handleEOFInComment();
            last = t;
            break comment;
          }
        else if (COMMENT_END.matches(this))
          {
            mustBe(DOUBLE_DASH);
            optional(WS);
            last = mustBe(END);
            break comment;
          }
        else if (COMMENT_TRIPLEDASH_END.matches(this))
          {
            mustBe(DOUBLE_DASH);
            t = mustBe(NUMTOKEN);
            if (t.getImage().equals("-"))
              {
                append(t);
                last = mustBe(END);
                break comment;
              }
            else
              {
                buffer.append("--");
                append(t);
                t = getTokenAhead();
              }
          }
        else
        /* The lllll-- can match as NUMTOKEN */
        if ((t.getImage().endsWith("--")) &&
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -