⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 htmlparser.java

📁 Lucene是Java社区中著名的全文检索工具.它是一个开源工具包,使用Java实现.
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
/* Generated By:JavaCC: Do not edit this line. HTMLParser.java */
package org.apache.lucene.demo.html;

import java.io.*;
import java.util.Properties;

/**
 * Parser for HTML input that collects a title, a summary and the meta tags
 * while the text content is written to a pipe that callers read through
 * {@link #getReader()}.
 *
 * <p>The header comment marks this file as JavaCC output.</p>
 */
public class HTMLParser implements HTMLParserConstants {
  /** Initial title capacity; also the length the summary is cut to by getSummary(). */
  public static int SUMMARY_LENGTH = 200;

  /** Text appended while {@link #inTitle} is set. */
  StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
  /** Text appended outside the title, until SUMMARY_LENGTH is reached. */
  StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
  /** Name/content pairs stored by addMetaTag(). */
  Properties metaTags = new Properties();
  /** Pending meta name, cleared once stored. */
  String currentMetaTag;
  /** Pending meta content, cleared once stored. */
  String currentMetaContent;
  /** Running count of characters written to the pipe. */
  int length;
  /** Polled by getTitle() and getMetaTags() to decide when to stop waiting. */
  boolean titleComplete;
  boolean inTitle;
  boolean inMetaTag;
  boolean inStyle;
  boolean afterTag;
  boolean afterSpace;
  /** Platform line separator. */
  String eol = System.getProperty("line.separator");
  /** Reader side of the pipe; null until getReader() has been called. */
  Reader pipeIn;
  /** Writer side of the pipe. */
  Writer pipeOut;
  private MyPipedInputStream pipeInStream;
  private PipedOutputStream pipeOutStream;

  /** PipedInputStream that can report whether its buffer has filled up. */
  private class MyPipedInputStream extends PipedInputStream {

    public MyPipedInputStream() {
      super();
    }

    public MyPipedInputStream(PipedOutputStream src) throws IOException {
      super(src);
    }

    /**
     * @return true when at least PIPE_SIZE bytes are available to read
     * @throws IOException if available() fails
     */
    public boolean full() throws IOException {
      int buffered = available();
      return buffered >= PIPE_SIZE;
    }
  }

  /**
   * Creates a parser reading from the given file.
   *
   * @deprecated Use HTMLParser(FileInputStream) instead
   */
  public HTMLParser(File file) throws FileNotFoundException {
    this(new FileInputStream(file));
  }

  /**
   * Returns the trimmed title collected so far. Calls getReader() first if
   * no reader exists yet, then polls every 10 ms until the title is flagged
   * complete or the pipe buffer is full.
   *
   * @throws IOException if the pipe cannot be set up or queried
   * @throws InterruptedException if interrupted while waiting
   */
  public String getTitle() throws IOException, InterruptedException {
    if (pipeIn == null) {
      getReader();                                // spawn parsing thread
    }
    for (boolean ready = false; !ready; ) {
      // The monitor is re-acquired on every pass, as in the plain loop form.
      synchronized (this) {
        ready = titleComplete || pipeInStream.full();
        if (!ready) {
          wait(10);
        }
      }
    }
    String collected = title.toString();
    return collected.trim();
  }

  /**
   * Returns the meta tags collected so far, using the same start-up and
   * waiting condition as getTitle().
   *
   * @throws IOException if the pipe cannot be set up or queried
   * @throws InterruptedException if interrupted while waiting
   */
  public Properties getMetaTags() throws IOException, InterruptedException {
    if (pipeIn == null) {
      getReader();                                // spawn parsing thread
    }
    for (boolean ready = false; !ready; ) {
      synchronized (this) {
        ready = titleComplete || pipeInStream.full();
        if (!ready) {
          wait(10);
        }
      }
    }
    return metaTags;
  }
public String getSummary() throws IOException, InterruptedException {    if (pipeIn == null)      getReader();                                // spawn parsing thread    while (true) {      synchronized(this) {        if (summary.length() >= SUMMARY_LENGTH || pipeInStream.full())          break;        wait(10);      }    }    if (summary.length() > SUMMARY_LENGTH)      summary.setLength(SUMMARY_LENGTH);    String sum = summary.toString().trim();    String tit = getTitle();    if (sum.startsWith(tit) || sum.equals(""))      return tit;    else      return sum;  }  public Reader getReader() throws IOException {    if (pipeIn == null) {      pipeInStream = new MyPipedInputStream();      pipeOutStream = new PipedOutputStream(pipeInStream);      pipeIn = new InputStreamReader(pipeInStream, "UTF-16BE");      pipeOut = new OutputStreamWriter(pipeOutStream, "UTF-16BE");      Thread thread = new ParserThread(this);      thread.start();                             // start parsing    }    return pipeIn;  }  void addToSummary(String text) {    if (summary.length() < SUMMARY_LENGTH) {      summary.append(text);      if (summary.length() >= SUMMARY_LENGTH) {        synchronized(this) {          notifyAll();        }      }    }  }  void addText(String text) throws IOException {    if (inStyle)      return;    if (inTitle)      title.append(text);    else {      addToSummary(text);      if (!titleComplete && !title.equals("")) {  // finished title        synchronized(this) {          titleComplete = true;                   // tell waiting threads          notifyAll();        }      }    }    length += text.length();    pipeOut.write(text);    afterSpace = false;  }  void addMetaTag() {      metaTags.setProperty(currentMetaTag, currentMetaContent);      currentMetaTag = null;      currentMetaContent = null;      return;  }  void addSpace() throws IOException {    if (!afterSpace) {      if (inTitle)        title.append(" ");      else        addToSummary(" ");      String space = 
afterTag ? eol : " ";      length += space.length();      pipeOut.write(space);      afterSpace = true;    }  }  final public void HTMLDocument() throws ParseException, IOException {  Token t;    label_1:    while (true) {      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {      case ScriptStart:      case TagName:      case DeclName:      case Comment1:      case Comment2:      case Word:      case Entity:      case Space:      case Punct:        ;        break;      default:        jj_la1[0] = jj_gen;        break label_1;      }      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {      case TagName:        Tag();                      afterTag = true;        break;      case DeclName:        t = Decl();                      afterTag = true;        break;      case Comment1:      case Comment2:        CommentTag();                      afterTag = true;        break;      case ScriptStart:        ScriptTag();                     afterTag = true;        break;      case Word:        t = jj_consume_token(Word);                      addText(t.image); afterTag = false;        break;      case Entity:        t = jj_consume_token(Entity);                      addText(Entities.decode(t.image)); afterTag = false;        break;      case Punct:        t = jj_consume_token(Punct);                      addText(t.image); afterTag = false;        break;      case Space:        jj_consume_token(Space);                      addSpace(); afterTag = false;        break;      default:        jj_la1[1] = jj_gen;        jj_consume_token(-1);        throw new ParseException();      }    }    jj_consume_token(0);  }  final public void Tag() throws ParseException, IOException {  Token t1, t2;  boolean inImg = false;    t1 = jj_consume_token(TagName);   String tagName = t1.image.toLowerCase();   if(Tags.WS_ELEMS.contains(tagName) ) {      addSpace();    }    inTitle = tagName.equalsIgnoreCase("<title"); // keep track if in <TITLE>    inMetaTag = tagName.equalsIgnoreCase("<META"); // keep track if in 
<META>    inStyle = tagName.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>    inImg = tagName.equalsIgnoreCase("<img");     // keep track if in <IMG>    label_2:    while (true) {      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {      case ArgName:        ;        break;      default:        jj_la1[2] = jj_gen;        break label_2;      }      t1 = jj_consume_token(ArgName);      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {      case ArgEquals:        jj_consume_token(ArgEquals);        switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {        case ArgValue:        case ArgQuote1:        case ArgQuote2:          t2 = ArgValue();       if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)         addText("[" + t2.image + "]");        if(inMetaTag &&                        (  t1.image.equalsIgnoreCase("name") ||                           t1.image.equalsIgnoreCase("HTTP-EQUIV")                        )           && t2 != null)        {                currentMetaTag=t2.image.toLowerCase();                if(currentMetaTag != null && currentMetaContent != null) {                addMetaTag();                }        }        if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=null)        {                currentMetaContent=t2.image.toLowerCase();                if(currentMetaTag != null && currentMetaContent != null) {                addMetaTag();                }        }          break;        default:          jj_la1[3] = jj_gen;          ;        }        break;      default:        jj_la1[4] = jj_gen;        ;      }    }    jj_consume_token(TagEnd);  }  final public Token ArgValue() throws ParseException {  Token t = null;    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {    case ArgValue:      t = jj_consume_token(ArgValue);                                              {if (true) return t;}      break;    default:      jj_la1[5] = jj_gen;      if (jj_2_1(2)) {        jj_consume_token(ArgQuote1);        jj_consume_token(CloseQuote1);                                  
            {if (true) return t;}      } else {        switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {        case ArgQuote1:          jj_consume_token(ArgQuote1);          t = jj_consume_token(Quote1Text);          jj_consume_token(CloseQuote1);                                              {if (true) return t;}          break;        default:          jj_la1[6] = jj_gen;          if (jj_2_2(2)) {            jj_consume_token(ArgQuote2);            jj_consume_token(CloseQuote2);                                              {if (true) return t;}          } else {            switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {            case ArgQuote2:              jj_consume_token(ArgQuote2);              t = jj_consume_token(Quote2Text);              jj_consume_token(CloseQuote2);                                              {if (true) return t;}              break;            default:              jj_la1[7] = jj_gen;              jj_consume_token(-1);              throw new ParseException();            }          }        }      }    }    throw new Error("Missing return statement in function");  }  final public Token Decl() throws ParseException {  Token t;    t = jj_consume_token(DeclName);    label_3:    while (true) {      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {      case ArgName:      case ArgEquals:      case ArgValue:      case ArgQuote1:      case ArgQuote2:        ;        break;      default:        jj_la1[8] = jj_gen;        break label_3;      }      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {      case ArgName:        jj_consume_token(ArgName);        break;      case ArgValue:      case ArgQuote1:      case ArgQuote2:        ArgValue();        break;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -