htmlparser.jj

来自「HTML解释器JAVA源码」· JJ 代码 · 共 404 行
404 行
/*
 * HtmlParser.jj -- JavaCC grammar for HTML.
 * Copyright (C) 1999 Quiotix Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License (http://www.gnu.org/copyleft/gpl.txt)
 * for more details.
 */

/*
 * JavaCC grammar file for HTML.
 *
 * Author: Brian Goetz, Quiotix
 * Version: 1.01
 * Revision: $Id: HtmlParser.jj,v 1.41 2000/11/10 02:50:54 brian Exp $
 *
 * This grammar parses an HTML document and produces a (flat) parse "tree"
 * representing the document.  It preserves almost all information in the
 * source document, including carriage control and spacing (except inside
 * of tags.)  See the HtmlDocument and HtmlDocument.* classes for a
 * description of the parse tree.  The parse tree supports traversal using
 * the commonly used "Visitor" pattern.  The HtmlDumper class is a visitor
 * which dumps out the tree to an output stream.
 *
 * It does not require begin tags to be matched with end tags, or validate
 * the names or contents of the tags (this can easily be done post-parsing;
 * see the HtmlCollector class (which matches begin tags with end tags)
 * for an example.)
 *
 * Notable edge cases include:
 * - Quoted string processing.  Quoted strings are matched inside of comments, and
 *   as tag attribute values.  Quoted strings are matched in normal text only
 *   to the extent that they do not span line breaks.
 *
 * Please direct comments, questions, gripes or praise to
 * html-parser@quiotix.com.  If you like it/hate it/use it, please let us know!
 */

options { IGNORE_CASE = true; STATIC = false; }

PARSER_BEGIN(HtmlParser)

package com.quiotix.html.parser;

public class HtmlParser {

  final static String NL = System.getProperty("line.separator");

  /**
   * Rebuilds the source text covered by a run of tokens.
   *
   * Walks the token chain from first through cur (inclusive) and
   * concatenates each token's image.  Any special tokens attached to a
   * token (e.g. the whitespace skipped inside tags) are emitted before
   * that token, in source order.
   *
   * @param first the first token of the run
   * @param cur   the last token of the run; must be reachable from
   *              first through the next links
   * @return the concatenated images of the run
   */
  private static String getTokenText(Token first, Token cur) {
    StringBuffer sb = new StringBuffer();
    for (Token t = first; t != cur.next; t = t.next) {
      if (t.specialToken != null) {
        // specialToken points at the special token closest to t; rewind to
        // the earliest one, then replay the chain forward via next.
        Token tt = t.specialToken;
        while (tt.specialToken != null)
          tt = tt.specialToken;
        for (; tt != null; tt = tt.next)
          sb.append(tt.image);
      }
      sb.append(t.image);
    }
    return sb.toString();
  }

  /**
   * Parses an HTML document from standard input and dumps the resulting
   * document to standard output using HtmlDumper.
   */
  public static void main(String[] args) throws ParseException {
    HtmlParser parser = new HtmlParser(System.in);
    HtmlDocument doc = parser.HtmlDocument();
    doc.accept(new HtmlDumper(System.out));
    System.exit(0);
  }
}

PARSER_END(HtmlParser)

/* Private (#) helper expressions, usable from every lexical state. */
<*> TOKEN :
{
  <#ALPHA_CHAR: ["a"-"z", "A"-"Z"] >
| <#NUM_CHAR:   ["0"-"9"] >
| <#ALPHANUM_CHAR: [ "a"-"z", "A"-"Z", "0"-"9" ] >
| <#IDENTIFIER_CHAR: [ "a"-"z", "A"-"Z", "0"-"9", "_", "-", "." ] >
| <#IDENTIFIER: <ALPHA_CHAR> (<IDENTIFIER_CHAR>)* >
| <#QUOTED_STRING_NB: ( "'" ( ~["'", "\r", "\n"] )* "'" )
                     | ( "\"" ( ~["\"", "\r", "\n"] )* "\"" ) >
| <#QUOTED_STRING: ( "'" ( ~["'"] )* "'" ) | ( "\"" ( ~["\""] )* "\"" ) >
| <#WHITESPACE: ( " " | "\t" | "\n" | "\r" ) >
| <#NEWLINE:    ( "\r\n" | "\r" | "\n" ) >
| <#QUOTE:      ( "'" | "\"" )>
}

/* Ordinary document text, plus the openers that switch to the tag,
   comment and declaration states. */
<DEFAULT> TOKEN :
{
  <EOL:            ( " " | "\t" )* <NEWLINE> >
| <TAG_START:      "<"       > : LexStartTag
| <ENDTAG_START:   "</"      > : LexStartTag
| <COMMENT_START:  "<!--"    > : LexComment
| <DECL_START:     "<!"      > : LexDecl
| <PCDATA:         ( ~["'", "\"", "<", "\r", "\n"] )+ >
| <PCDATA_QS:      <QUOTED_STRING_NB> >
| <PCDATA_Q:       <QUOTE> >
}

/* Immediately after "<" or "</": expect a tag name.  Any other character
   is reported as LST_ERROR and lexing returns to DEFAULT. */
<LexStartTag> TOKEN :
{
  <TAG_SCRIPT: "SCRIPT">    : LexInTag
| <TAG_STYLE:  "STYLE">     : LexInTag
| <TAG_NAME: <IDENTIFIER> > : LexInTag
| <LST_ERROR: ~[]>          : DEFAULT
}

/* Whitespace inside a tag is kept as special tokens rather than discarded,
   so getTokenText() can reproduce it. */
<LexInTag> SPECIAL_TOKEN :
{
  < (<WHITESPACE>)+ >
}

<LexInTag> TOKEN :
{
  <ATTR_NAME: <IDENTIFIER> >
| <TAG_END: ">" >                       : DEFAULT
| <TAG_SLASHEND: "/>" >                 : DEFAULT
| <ATTR_EQ: "=" >                       : LexAttrVal
| <IMPLICIT_TAG_END: "<">
  {
    // A "<" inside a tag: rewrite the matched token into a TAG_END (">")
    // and splice a synthetic TAG_START ("<") token right after it, so the
    // parser sees the current tag closed and a new one opened.
    Token t = new Token();
    t.image       = "<";
    t.kind        = TAG_START;
    t.next        = matchedToken.next;
    t.beginLine   = matchedToken.beginLine;
    t.beginColumn = matchedToken.beginColumn;
    t.endLine     = matchedToken.endLine;
    t.endColumn   = matchedToken.endColumn;
    matchedToken.next  = t;
    matchedToken.kind  = TAG_END;
    matchedToken.image = ">";
  }                                     : LexStartTag
| <LIT_ERROR: ~[]>
}

<LexAttrVal> SPECIAL_TOKEN :
{
  < <WHITESPACE> >
}

/* After "=": a quoted string (which may span lines) or an unquoted run. */
<LexAttrVal> TOKEN :
{
  <ATTR_VAL: <QUOTED_STRING> | ( ~[">", "\"", "'", " ", "\t", "\n", "\r"] )+ > : LexInTag
| <LAV_ERROR: ~[]>
}

/* Inside "<!--": the comment ends at "--", optional spaces, ">" or at "->". */
<LexComment> TOKEN :
{
  < COMMENT_END:  ("--" (" ")* ">" | "->") > : DEFAULT
| < DASH:         "-" >
| < COMMENT_EOL:  <NEWLINE> >
| < COMMENT_WORD: ( (~[ "\n", "\r", "'", "\"", "-" ])+
                     | <QUOTED_STRING_NB>
                     | <QUOTE> ) >
}

/* Inside "<!": everything up to the closing ">" (quoted strings may
   contain ">"). */
<LexDecl> TOKEN :
{
  <DECL_ANY: ( <QUOTED_STRING> | ~[ ">", "\"", "'" ] )+ >
| <DECL_END: ">" > : DEFAULT
}

/* LexScript and LexStyle are entered explicitly by the parser through
   token_source.SwitchTo() in ScriptBlock() and StyleBlock(). */
<LexScript> TOKEN :
{
  <SCRIPT_END:   "</SCRIPT>" > : DEFAULT
}

<LexStyle> TOKEN :
{
  <STYLE_END:    "</STYLE>" > : DEFAULT
}

<LexScript, LexStyle> TOKEN :
{
  <BLOCK_EOL:    <NEWLINE> >
| <BLOCK_LBR:    "<" >
| <BLOCK_WORD:   ( <QUOTED_STRING_NB>
                    | <QUOTE>
                    | (~[ "\n", "\r", "'", "\"", "<"])+ ) >
}

/** Start production: a sequence of elements followed by end of input. */
HtmlDocument HtmlDocument() :
{
  HtmlDocument.ElementSequence s;
}
{
  s=ElementSequence() <EOF>
  { return new HtmlDocument(s); }
}

/** Collects zero or more elements into an ElementSequence. */
HtmlDocument.ElementSequence ElementSequence() :
{
  HtmlDocument.ElementSequence s = new HtmlDocument.ElementSequence();
  HtmlDocument.HtmlElement h;
}
{
  ( h=Element() { s.addElement(h); } ) *
  { return s; }
}

/**
 * Parses a single element: a tag, end tag, comment, declaration, SCRIPT or
 * STYLE block, a run of text, or a newline.  A "<" that is not followed by
 * a tag name is returned as text.
 */
HtmlDocument.HtmlElement Element() :
{
  HtmlDocument.HtmlElement e;
  Token text;
}
{
(
    // Two tokens of lookahead are needed because Tag, ScriptBlock,
    // StyleBlock and the error alternative all begin with TAG_START.
    LOOKAHEAD(2)
      e = Tag()           { return e; }
  |   e = EndTag()        { return e; }
  |   e = CommentTag()    { return e; }
  |   e = DeclTag()       { return e; }
  | LOOKAHEAD(2)
      e = ScriptBlock()   { return e; }
  | LOOKAHEAD(2)
      e = StyleBlock()    { return e; }
  | LOOKAHEAD(2)
      <TAG_START> text=<LST_ERROR>
                          { return new HtmlDocument.Text("<" + text.image); }
  |   text = <PCDATA>     { return new HtmlDocument.Text(text.image); }
  |   text = <PCDATA_QS>  { return new HtmlDocument.Text(text.image); }
  |   text = <PCDATA_Q>   { return new HtmlDocument.Text(text.image); }
  |          <EOL>        { return new HtmlDocument.Newline(); }
)
}

/** Parses one attribute: a name, optionally followed by "=" and a value. */
HtmlDocument.Attribute Attribute() :
{
  Token t1, t2=null;
}
{
  t1=<ATTR_NAME> [ <ATTR_EQ> t2=<ATTR_VAL> ]
  { if (t2 == null)
      return new HtmlDocument.Attribute(t1.image);
    else
      return new HtmlDocument.Attribute(t1.image, t2.image);
  }
}

/** Parses zero or more attributes into an AttributeList. */
HtmlDocument.AttributeList AttributeList() :
{
  HtmlDocument.AttributeList alist = new HtmlDocument.AttributeList();
  HtmlDocument.Attribute a;
}
{
  (a=Attribute() { alist.addAttribute(a); } )*
  { return alist; }
}

/**
 * Parses a start tag, ended by ">" or "/>" (the latter marks the tag
 * empty).  If the tag is malformed, the lexer is reset to DEFAULT and the
 * tokens consumed so far, plus the next token, are returned as plain text.
 */
HtmlDocument.HtmlElement Tag() :
{
  Token t, et;
  HtmlDocument.AttributeList alist;
  Token firstToken = getToken(1);
}
{
  try {
    <TAG_START> t=<TAG_NAME> alist=AttributeList()
    ( et=<TAG_END> | et=<TAG_SLASHEND> )
    {
      HtmlDocument.Tag tag = new HtmlDocument.Tag(t.image, alist);
      if (et.kind == TAG_SLASHEND) tag.setEmpty(true);
      return tag;
    }
  }
  catch (ParseException ex) {
    token_source.SwitchTo(DEFAULT);
    String s = getTokenText(firstToken, getNextToken());
    return new HtmlDocument.Text(s);
  }
}

/**
 * Parses the body of a SCRIPT or STYLE block.  Words and "<" characters
 * are accumulated into Text elements, split at line ends by Newline
 * elements; a Newline element is always appended at the end.
 */
HtmlDocument.ElementSequence BlockContents() :
{
  Token t;
  StringBuffer s = new StringBuffer();
  HtmlDocument.ElementSequence e = new HtmlDocument.ElementSequence();
}
{
  ( <BLOCK_EOL> {
      // Flush the text gathered on this line before recording the newline.
      if (s.length() > 0) {
        e.addElement(new HtmlDocument.Text(s.toString()));
        s.setLength(0);
      }
      e.addElement(new HtmlDocument.Newline());
    }
    | t=<BLOCK_WORD> { s.append(t.image); }
    | t=<BLOCK_LBR>  { s.append(t.image); }
  )*
  {
    if (s.length() > 0)
      e.addElement(new HtmlDocument.Text(s.toString()));
    e.addElement(new HtmlDocument.Newline());
    return e;
  }
}

/**
 * Parses a SCRIPT start tag, its contents and the closing tag into a
 * TagBlock.  On a parse error the lexer is reset to DEFAULT and the tokens
 * consumed so far, plus the next token, are returned as plain text.
 */
HtmlDocument.HtmlElement ScriptBlock() :
{
  HtmlDocument.AttributeList alist;
  HtmlDocument.ElementSequence e;
  Token firstToken = getToken(1);
}
{
  try {
    <TAG_START> <TAG_SCRIPT> alist=AttributeList() <TAG_END>
    {
      // TAG_END left the lexer in DEFAULT; switch to the script state so
      // the body is tokenized as opaque block text.
      token_source.SwitchTo(LexScript);
    }
    e=BlockContents()
    <SCRIPT_END>
    {
      return new HtmlDocument.TagBlock("SCRIPT", alist, e);
    }
  }
  catch (ParseException ex) {
    token_source.SwitchTo(DEFAULT);
    String s = getTokenText(firstToken, getNextToken());
    return new HtmlDocument.Text(s);
  }
}

/**
 * Parses a STYLE start tag, its contents and the closing tag into a
 * TagBlock.  On a parse error the lexer is reset to DEFAULT and the tokens
 * consumed so far, plus the next token, are returned as plain text.
 */
HtmlDocument.HtmlElement StyleBlock() :
{
  HtmlDocument.AttributeList alist;
  HtmlDocument.ElementSequence e;
  Token firstToken = getToken(1);
}
{
  try {
    <TAG_START> <TAG_STYLE> alist=AttributeList() <TAG_END>
    {
      // TAG_END left the lexer in DEFAULT; switch to the style state so
      // the body is tokenized as opaque block text.
      token_source.SwitchTo(LexStyle);
    }
    e=BlockContents()
    <STYLE_END>
    {
      return new HtmlDocument.TagBlock("STYLE", alist, e);
    }
  }
  catch (ParseException ex) {
    token_source.SwitchTo(DEFAULT);
    String s = getTokenText(firstToken, getNextToken());
    return new HtmlDocument.Text(s);
  }
}

/**
 * Parses an end tag.  The names SCRIPT and STYLE are lexed as TAG_SCRIPT
 * and TAG_STYLE rather than TAG_NAME, so all three name tokens are accepted
 * here; otherwise a stray end tag for either name (one not consumed by
 * ScriptBlock/StyleBlock) could never be recognized as an end tag.  On a
 * parse error the lexer is reset to DEFAULT and the tokens consumed so
 * far, plus the next token, are returned as plain text.
 */
HtmlDocument.HtmlElement EndTag() :
{
  Token t;
  Token firstToken = getToken(1);
}
{
  try {
    <ENDTAG_START> ( t=<TAG_NAME> | t=<TAG_SCRIPT> | t=<TAG_STYLE> ) <TAG_END>
    { return new HtmlDocument.EndTag(t.image); }
  }
  catch (ParseException ex) {
    token_source.SwitchTo(DEFAULT);
    String s = getTokenText(firstToken, getNextToken());
    return new HtmlDocument.Text(s);
  }
}

/**
 * Parses a comment.  The returned text starts and ends with "--"; line
 * breaks inside the comment are replaced by the platform line separator.
 * A comment left open at end of input is accepted as well.
 */
HtmlDocument.Comment CommentTag() :
{
  Token t;
  StringBuffer s = new StringBuffer("--");
}
{
  <COMMENT_START>
  ( t=<DASH> { s.append(t.image); }
    | <COMMENT_EOL>  { s.append(NL); }
    | t=<COMMENT_WORD> { s.append(t.image); } )*
  (<EOF> | <COMMENT_END>)
  { return new HtmlDocument.Comment(s.append("--").toString()); }
}

/**
 * Parses a declaration and returns its body as a Comment element.  The
 * body is optional so that the empty declaration (DECL_START directly
 * followed by DECL_END) yields an empty comment instead of a
 * ParseException that would abort the whole parse.
 */
HtmlDocument.Comment DeclTag() :
{
  Token t = null;
}
{
  <DECL_START> [ t=<DECL_ANY> ] <DECL_END>
  {
    return new HtmlDocument.Comment(t == null ? "" : t.image);
  }
}
htmlparser.jj - 源码说明

本页面展示了「HTML解释器JAVA源码」中的 htmlparser.jj 源码文件，采用 JJ 编程语言编写，共 404 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与HTML相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?