⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 htmlparser.java

📁 simple spider 源码 better light faster java 书籍源码.
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
/* Generated By:JavaCC: Do not edit this line. HTMLParser.java */
package org.apache.lucene.demo.html;

import java.io.*;

public class HTMLParser implements HTMLParserConstants {
  public static int SUMMARY_LENGTH = 200;

  StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
  StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
  int length = 0;
  boolean titleComplete = false;
  boolean inTitle = false;
  boolean inScript = false;
  boolean afterTag = false;
  boolean afterSpace = false;
  String eol = System.getProperty("line.separator");
  PipedReader pipeIn = null;
  PipedWriter pipeOut;

  public HTMLParser(File file) throws FileNotFoundException {
    this(new FileInputStream(file));
  }

  public String getTitle() throws IOException, InterruptedException {
    if (pipeIn == null)
      getReader();                                // spawn parsing thread
    while (true) {
      synchronized(this) {
        if (titleComplete || (length > SUMMARY_LENGTH))
          break;
        wait(10);
      }
    }
    return title.toString().trim();
  }

  public String getSummary() throws IOException, InterruptedException {
    if (pipeIn == null)
      getReader();                                // spawn parsing thread
    while (true) {
      synchronized(this) {
        if (summary.length() >= SUMMARY_LENGTH)
          break;
        wait(10);
      }
    }
    if (summary.length() > SUMMARY_LENGTH)
      summary.setLength(SUMMARY_LENGTH);

    String sum = summary.toString().trim();
    String tit = getTitle();
    if (sum.startsWith(tit))
      return sum.substring(tit.length());
    else
      return sum;
  }

  public Reader getReader() throws IOException {
    if (pipeIn == null) {
      pipeIn = new PipedReader();
      pipeOut = new PipedWriter(pipeIn);

      Thread thread = new ParserThread(this);
      thread.start();                             // start parsing
    }

    return pipeIn;
  }

  void addToSummary(String text) {
    if (summary.length() < SUMMARY_LENGTH) {
      summary.append(text);
      if (summary.length() >= SUMMARY_LENGTH) {
        synchronized(this) {
          notifyAll();
        }
      }
    }
  }

  void addText(String text) throws IOException {
    if (inScript)
      return;
    if (inTitle)
      title.append(text);
    else {
      addToSummary(text);
      if (!titleComplete && !title.equals("")) {  // finished title
        synchronized(this) {
          titleComplete = true;                   // tell waiting threads
          notifyAll();
        }
      }
    }

    length += text.length();
    pipeOut.write(text);

    afterSpace = false;
  }

  void addSpace() throws IOException {
    if (inScript)
      return;
    if (!afterSpace) {
      if (inTitle)
        title.append(" ");
      else
        addToSummary(" ");

      String space = afterTag ? eol : " ";
      length += space.length();
      pipeOut.write(space);
      afterSpace = true;
    }
  }

  final public void HTMLDocument() throws ParseException, IOException {
  Token t;
    label_1:
    while (true) {
      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
      case TagName:
      case DeclName:
      case Comment1:
      case Comment2:
      case Word:
      case Entity:
      case Space:
      case Punct:
        ;
        break;
      default:
        jj_la1[0] = jj_gen;
        break label_1;
      }
      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
      case TagName:
        Tag();
                      afterTag = true;
        break;
      case DeclName:
        t = Decl();
                      afterTag = true;
        break;
      case Comment1:
      case Comment2:
        CommentTag();
                      afterTag = true;
        break;
      case Word:
        t = jj_consume_token(Word);
                      addText(t.image); afterTag = false;
        break;
      case Entity:
        t = jj_consume_token(Entity);
                      addText(Entities.decode(t.image)); afterTag = false;
        break;
      case Punct:
        t = jj_consume_token(Punct);
                      addText(t.image); afterTag = false;
        break;
      case Space:
        jj_consume_token(Space);
                      addSpace(); afterTag = false;
        break;
      default:
        jj_la1[1] = jj_gen;
        jj_consume_token(-1);
        throw new ParseException();
      }
    }
    jj_consume_token(0);
  }

  final public void Tag() throws ParseException, IOException {
  Token t1, t2;
  boolean inImg = false;
    t1 = jj_consume_token(TagName);
    inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if in <TITLE>
    inImg = t1.image.equalsIgnoreCase("<img");    // keep track if in <IMG>
    if (inScript) {                               // keep track if in <SCRIPT>
      inScript = !t1.image.equalsIgnoreCase("</script");
    } else {
      inScript = t1.image.equalsIgnoreCase("<script");
    }
    label_2:
    while (true) {
      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
      case ArgName:
        ;
        break;
      default:
        jj_la1[2] = jj_gen;
        break label_2;
      }
      t1 = jj_consume_token(ArgName);
      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
      case ArgEquals:
        jj_consume_token(ArgEquals);
        switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
        case ArgValue:
        case ArgQuote1:
        case ArgQuote2:
          t2 = ArgValue();
       if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
         addText("[" + t2.image + "]");
          break;
        default:
          jj_la1[3] = jj_gen;
          ;
        }
        break;
      default:
        jj_la1[4] = jj_gen;
        ;
      }
    }
    jj_consume_token(TagEnd);
  }

  final public Token ArgValue() throws ParseException {
  Token t = null;
    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
    case ArgValue:
      t = jj_consume_token(ArgValue);
                                              {if (true) return t;}
      break;
    default:
      jj_la1[5] = jj_gen;
      if (jj_2_1(2)) {
        jj_consume_token(ArgQuote1);
        jj_consume_token(CloseQuote1);
                                              {if (true) return t;}
      } else {
        switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
        case ArgQuote1:
          jj_consume_token(ArgQuote1);
          t = jj_consume_token(Quote1Text);
          jj_consume_token(CloseQuote1);
                                              {if (true) return t;}
          break;
        default:
          jj_la1[6] = jj_gen;
          if (jj_2_2(2)) {
            jj_consume_token(ArgQuote2);
            jj_consume_token(CloseQuote2);
                                              {if (true) return t;}
          } else {
            switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
            case ArgQuote2:
              jj_consume_token(ArgQuote2);
              t = jj_consume_token(Quote2Text);
              jj_consume_token(CloseQuote2);
                                              {if (true) return t;}
              break;
            default:
              jj_la1[7] = jj_gen;
              jj_consume_token(-1);
              throw new ParseException();
            }
          }
        }
      }
    }
    throw new Error("Missing return statement in function");
  }

  final public Token Decl() throws ParseException {
  Token t;
    t = jj_consume_token(DeclName);
    label_3:
    while (true) {
      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
      case ArgName:
      case ArgEquals:
      case ArgValue:
      case ArgQuote1:
      case ArgQuote2:
        ;
        break;
      default:
        jj_la1[8] = jj_gen;
        break label_3;
      }
      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
      case ArgName:
        jj_consume_token(ArgName);
        break;
      case ArgValue:
      case ArgQuote1:
      case ArgQuote2:
        ArgValue();
        break;
      case ArgEquals:
        jj_consume_token(ArgEquals);
        break;
      default:
        jj_la1[9] = jj_gen;
        jj_consume_token(-1);
        throw new ParseException();
      }
    }
    jj_consume_token(TagEnd);
    {if (true) return t;}
    throw new Error("Missing return statement in function");
  }

  final public void CommentTag() throws ParseException {
    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
    case Comment1:
      jj_consume_token(Comment1);
      label_4:
      while (true) {
        switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
        case CommentText1:
          ;
          break;
        default:
          jj_la1[10] = jj_gen;
          break label_4;
        }

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -