📄 htmlparser.jj

📁 lucene实现全文检索的实际小例子,可以实现对文本文件的检索,和对内容的查询.! lucene实现全文检索的实际小例子,可以实现对文本文件的检索,和对内容的查询.!
💻 JJ
字号:
/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements.  See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License.  You may obtain a copy of the License at * *     http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */// HTMLParser.jjoptions {  STATIC = false;  OPTIMIZE_TOKEN_MANAGER = true;  //DEBUG_LOOKAHEAD = true;  //DEBUG_TOKEN_MANAGER = true;}PARSER_BEGIN(HTMLParser)package org.apache.lucene.demo.html;import java.io.*;import java.util.Properties;public class HTMLParser {  public static int SUMMARY_LENGTH = 200;  StringBuffer title = new StringBuffer(SUMMARY_LENGTH);  StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);  Properties metaTags=new Properties();  String currentMetaTag=null;  String currentMetaContent=null;  int length = 0;  boolean titleComplete = false;  boolean inTitle = false;  boolean inMetaTag = false;  boolean inStyle = false;  boolean afterTag = false;  boolean afterSpace = false;  String eol = System.getProperty("line.separator");  Reader pipeIn = null;  Writer pipeOut;  private MyPipedInputStream pipeInStream = null;  private PipedOutputStream pipeOutStream = null;    private class MyPipedInputStream extends PipedInputStream{        public MyPipedInputStream(){      super();    }        public MyPipedInputStream(PipedOutputStream src) throws IOException{      super(src);    }        public boolean full() throws IOException{      return this.available() >= PipedInputStream.PIPE_SIZE;    }  }  /**   * @deprecated Use HTMLParser(FileInputStream) instead   */  public HTMLParser(File file) throws FileNotFoundException {    this(new FileInputStream(file));  }  public String getTitle() throws IOException, InterruptedException {    if (pipeIn == null)      getReader();				  // spawn parsing thread    while (true) {      synchronized(this) {	if (titleComplete || pipeInStream.full())	  break;	wait(10);      }    }    return title.toString().trim();  }  public Properties getMetaTags() throws IOException,InterruptedException {    if (pipeIn == null)      getReader();				  // spawn parsing thread    while (true) {      synchronized(this) {	if (titleComplete || pipeInStream.full())	  break;	wait(10);      }    }    return metaTags;  }  public String getSummary() throws IOException, InterruptedException {    if (pipeIn == null)      getReader();				  // spawn parsing thread    while (true) {      synchronized(this) {	if (summary.length() >= SUMMARY_LENGTH || pipeInStream.full())	  break;	wait(10);      }    }    if (summary.length() > SUMMARY_LENGTH)      summary.setLength(SUMMARY_LENGTH);    String sum = summary.toString().trim();    String tit = getTitle();    if (sum.startsWith(tit) || sum.equals(""))      return tit;    else      return sum;  }  public Reader getReader() throws IOException {    if (pipeIn == null) {      pipeInStream = new MyPipedInputStream();      pipeOutStream = new PipedOutputStream(pipeInStream);      pipeIn = new InputStreamReader(pipeInStream, "UTF-16BE");      pipeOut = new OutputStreamWriter(pipeOutStream, "UTF-16BE");      Thread thread = new ParserThread(this);      thread.start();				  // start parsing    }    return pipeIn;  }  void addToSummary(String text) {    if (summary.length() < SUMMARY_LENGTH) {      summary.append(text);      if (summary.length() >= SUMMARY_LENGTH) {	synchronized(this) {	  notifyAll();	}      }    }  }  void addText(String text) throws IOException {    if (inStyle)      return;    if (inTitle)      title.append(text);    else {      addToSummary(text);      if (!titleComplete && !title.equals("")) {  // finished title	synchronized(this) {	  titleComplete = true;			  // tell waiting threads	  notifyAll();	}      }    }    length += text.length();    pipeOut.write(text);    afterSpace = false;  }    void addMetaTag() {      metaTags.setProperty(currentMetaTag, currentMetaContent);      currentMetaTag = null;      currentMetaContent = null;      return;  }  void addSpace() throws IOException {    if (!afterSpace) {      if (inTitle)	title.append(" ");      else	addToSummary(" ");      String space = afterTag ? eol : " ";      length += space.length();      pipeOut.write(space);      afterSpace = true;    }  }//    void handleException(Exception e) {//      System.out.println(e.toString());  // print the error message//      System.out.println("Skipping...");//      Token t;//      do {//        t = getNextToken();//      } while (t.kind != TagEnd);//    }}PARSER_END(HTMLParser)void HTMLDocument() throws IOException :{  Token t;}{//  try {    ( Tag()         { afterTag = true; }    | t=Decl()      { afterTag = true; }    | CommentTag()  { afterTag = true; }    | ScriptTag()  { afterTag = true; }    | t=<Word>      { addText(t.image); afterTag = false; }    | t=<Entity>    { addText(Entities.decode(t.image)); afterTag = false; }    | t=<Punct>     { addText(t.image); afterTag = false; }    | <Space>       { addSpace(); afterTag = false; }    )* <EOF>//  } catch (ParseException e) {//    handleException(e);//  }}void Tag() throws IOException :{  Token t1, t2;  boolean inImg = false;}{  t1=<TagName> {   String tagName = t1.image.toLowerCase();   if(Tags.WS_ELEMS.contains(tagName) ) {      addSpace();    }    inTitle = tagName.equalsIgnoreCase("<title"); // keep track if in <TITLE>    inMetaTag = tagName.equalsIgnoreCase("<META"); // keep track if in <META>    inStyle = tagName.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>    inImg = tagName.equalsIgnoreCase("<img");	  // keep track if in <IMG>  }  (t1=<ArgName>   (<ArgEquals>    (t2=ArgValue()				  // save ALT text in IMG tag     {       if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)         addText("[" + t2.image + "]");    	if(inMetaTag &&			(  t1.image.equalsIgnoreCase("name") ||			   t1.image.equalsIgnoreCase("HTTP-EQUIV")			)	   && t2 != null)	{		currentMetaTag=t2.image.toLowerCase();		if(currentMetaTag != null && currentMetaContent != null) {        	addMetaTag();		}	}    	if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=null)	{		currentMetaContent=t2.image.toLowerCase();		if(currentMetaTag != null && currentMetaContent != null) {        	addMetaTag();		}	}     }    )?   )?  )*  <TagEnd>}Token ArgValue() :{  Token t = null;}{  t=<ArgValue>                              { return t; }| LOOKAHEAD(2)  <ArgQuote1> <CloseQuote1>                 { return t; }| <ArgQuote1> t=<Quote1Text> <CloseQuote1>  { return t; }| LOOKAHEAD(2)  <ArgQuote2> <CloseQuote2>                 { return t; }| <ArgQuote2> t=<Quote2Text> <CloseQuote2>  { return t; }}Token Decl() :{  Token t;}{  t=<DeclName> ( <ArgName> | ArgValue() | <ArgEquals> )* <TagEnd>  { return t; }}void CommentTag() :{}{  (<Comment1> ( <CommentText1> )* <CommentEnd1>) |  (<Comment2> ( <CommentText2> )* <CommentEnd2>)}void ScriptTag() :{}{  <ScriptStart> ( <ScriptText> )* <ScriptEnd>}TOKEN :{  < ScriptStart: "<script" > : WithinScript| < TagName:  "<" ("/")? ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag| < DeclName: "<"  "!"   ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag| < Comment1:  "<!--" > : WithinComment1| < Comment2:  "<!" >   : WithinComment2| < Word:     ( <LET> | <LET> (["+","/"])+ | <NUM> ["\""] |                <LET> ["-","'"] <LET> | ("$")? <NUM> [",","."] <NUM> )+ >| < #LET:     ["A"-"Z","a"-"z","0"-"9"] >| < #NUM:     ["0"-"9"] >| < #HEX:     ["0"-"9","A"-"F","a"-"f"] >| < Entity:   ( "&" (["A"-"Z","a"-"z"])+ (";")? | "&" "#" (<NUM>)+ (";")? | "&" "#" ["X","x"] (<HEX>)+ (";")? ) >| < Space:    (<SP>)+ >| < #SP:      [" ","\t","\r","\n"] >| < Punct:    ~[] > // Keep this last.  It is a catch-all.}<WithinScript> TOKEN:{  < ScriptText:  (~["<",">"])+ | "<" | ">" >| < ScriptEnd: "</script" (~["<",">"])* ">" > : DEFAULT}<WithinTag> TOKEN:{  < ArgName:   (~[" ","\t","\r","\n","=",">","'","\""])               (~[" ","\t","\r","\n","=",">"])* >| < ArgEquals: "=" >  : AfterEquals| < TagEnd:    ">" | "=>" >  : DEFAULT}<AfterEquals> TOKEN:{  < ArgValue:  (~[" ","\t","\r","\n","=",">","'","\""])	       (~[" ","\t","\r","\n",">"])* > : WithinTag}<WithinTag, AfterEquals> TOKEN:{  < ArgQuote1: "'"  > : WithinQuote1| < ArgQuote2: "\"" > : WithinQuote2}<WithinTag, AfterEquals> SKIP:{  < <Space> >}<WithinQuote1> TOKEN:{  < Quote1Text:  (~["'"])+ >| < CloseQuote1: <ArgQuote1> > : WithinTag}<WithinQuote2> TOKEN:{  < Quote2Text:  (~["\""])+ >| < CloseQuote2: <ArgQuote2> > : WithinTag}<WithinComment1> TOKEN :{  < CommentText1:  (~["-"])+ | "-" >| < CommentEnd1:   "-->" > : DEFAULT}<WithinComment2> TOKEN :{  < CommentText2:  (~[">"])+ >| < CommentEnd2:   ">" > : DEFAULT}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -