📄 standardtokenizer.java
字号:
/* Generated By:JavaCC: Do not edit this line. StandardTokenizer.java */
// NOTE(review): this JavaCC-generated file arrived collapsed onto a single
// line; it has been re-wrapped for readability and annotated with comments
// only -- no code tokens were changed.  Fixes belong in the .jj grammar,
// from which this file should be regenerated.
package org.apache.lucene.analysis.standard;

import java.io.*;

/** A grammar-based tokenizer constructed with JavaCC.
 *
 * <p> This should be a good tokenizer for most European-language documents:
 *
 * <ul>
 *   <li>Splits words at punctuation characters, removing punctuation. However, a
 *   dot that's not followed by whitespace is considered part of a token.
 *   <li>Splits words at hyphens, unless there's a number in the token, in which case
 *   the whole token is interpreted as a product number and is not split.
 *   <li>Recognizes email addresses and internet hostnames as one token.
 * </ul>
 *
 * <p>Many applications have specific tokenizer needs. If this tokenizer does
 * not suit your application, please consider copying this source code
 * directory to your project and maintaining your own grammar-based tokenizer.
 */
public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer
    implements StandardTokenizerConstants {

  /** Constructs a tokenizer for this Reader. */
  public StandardTokenizer(Reader reader) {
    // Wrap the Reader in the JavaCC CharStream adapter used by the
    // generated token manager.
    this(new FastCharStream(reader));
    // 'input' is declared in the Tokenizer superclass (not visible in this
    // file); keep the raw Reader alongside the wrapped char stream.
    this.input = reader;
  }

  /** Returns the next token in the stream, or null at EOS.
   * <p>The returned token's type is set to an element of {@link
   * StandardTokenizerConstants#tokenImage}.
   */
  final public org.apache.lucene.analysis.Token next() throws ParseException, IOException {
    Token token = null;
    // Dispatch on the kind of the lookahead token; jj_ntk() fetches and
    // caches it on first use (-1 means "not yet computed").
    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
    case ALPHANUM:
      token = jj_consume_token(ALPHANUM);
      break;
    case APOSTROPHE:
      token = jj_consume_token(APOSTROPHE);
      break;
    case ACRONYM:
      token = jj_consume_token(ACRONYM);
      break;
    case COMPANY:
      token = jj_consume_token(COMPANY);
      break;
    case EMAIL:
      token = jj_consume_token(EMAIL);
      break;
    case HOST:
      token = jj_consume_token(HOST);
      break;
    case NUM:
      token = jj_consume_token(NUM);
      break;
    case CJ:
      token = jj_consume_token(CJ);
      break;
    case 0:
      // Kind 0 is the EOF token in JavaCC-generated code (see the
      // token.kind == EOF test below).
      token = jj_consume_token(0);
      break;
    default:
      // Unexpected token kind: record the current generation for error
      // reporting, then force a ParseException via an impossible consume.
      jj_la1[0] = jj_gen;
      jj_consume_token(-1);
      throw new ParseException();
    }
    if (token.kind == EOF) {
      // End of stream: the Tokenizer contract here is to return null.
      // (The '{if (true) return ...;}' wrappers are standard JavaCC
      // codegen artifacts -- leave them intact.)
      {if (true) return null;}
    } else {
      // Wrap the internal JavaCC Token in a Lucene analysis Token, using
      // begin/end column as the start/end offsets and the kind's image as
      // the token type.
      // NOTE(review): this assumes FastCharStream reports absolute stream
      // offsets through the column fields -- confirm against that class.
      {if (true) return new org.apache.lucene.analysis.Token(token.image,
          token.beginColumn, token.endColumn, tokenImage[token.kind]);}
    }
    throw new Error("Missing return statement in function");
  }

  /** Generated token manager producing raw Tokens from the char stream. */
  public StandardTokenizerTokenManager token_source;
  /** Current token and lookahead token (chained through Token.next). */
  public Token token, jj_nt;
  /** Cached kind of the lookahead token, or -1 when not yet fetched. */
  private int jj_ntk;
  /** Generation counter, incremented on every successful consume. */
  private int jj_gen;
  /** Error-recovery state per choice point (this grammar has exactly one). */
  final private int[] jj_la1 = new int[1];
  static private int[] jj_la1_0;
  static {
    jj_la1_0();
  }
  private static void jj_la1_0() {
    // Generated bit mask of the token kinds accepted at choice point 0;
    // used by generateParseException() to list expected tokens.
    jj_la1_0 = new int[] {0x10ff,};
  }

  /** Constructor with user-supplied CharStream. */
  public StandardTokenizer(CharStream stream) {
    token_source = new StandardTokenizerTokenManager(stream);
    token = new Token();
    jj_ntk = -1;
    jj_gen = 0;
    for (int i = 0; i < 1; i++) jj_la1[i] = -1;
  }

  /** Reinitialise this tokenizer over a new CharStream. */
  public void ReInit(CharStream stream) {
    token_source.ReInit(stream);
    token = new Token();
    jj_ntk = -1;
    jj_gen = 0;
    for (int i = 0; i < 1; i++) jj_la1[i] = -1;
  }

  /** Constructor with a pre-built generated token manager. */
  public StandardTokenizer(StandardTokenizerTokenManager tm) {
    token_source = tm;
    token = new Token();
    jj_ntk = -1;
    jj_gen = 0;
    for (int i = 0; i < 1; i++) jj_la1[i] = -1;
  }

  /** Reinitialise this tokenizer over a new token manager. */
  public void ReInit(StandardTokenizerTokenManager tm) {
    token_source = tm;
    token = new Token();
    jj_ntk = -1;
    jj_gen = 0;
    for (int i = 0; i < 1; i++) jj_la1[i] = -1;
  }

  /** Consume the next token if it has the expected kind, else roll back
   *  and throw a ParseException describing what was expected. */
  final private Token jj_consume_token(int kind) throws ParseException {
    Token oldToken;
    // Advance: reuse a previously fetched lookahead token if one is chained
    // on token.next, otherwise pull a fresh one from the token manager.
    if ((oldToken = token).next != null) token = token.next;
    else token = token.next = token_source.getNextToken();
    jj_ntk = -1;
    if (token.kind == kind) {
      jj_gen++;
      return token;
    }
    // Mismatch: restore the previous position and remember the kind we
    // wanted so generateParseException() can report it.
    token = oldToken;
    jj_kind = kind;
    throw generateParseException();
  }

  /** Unconditionally advance to and return the next token. */
  final public Token getNextToken() {
    if (token.next != null) token = token.next;
    else token = token.next = token_source.getNextToken();
    jj_ntk = -1;
    jj_gen++;
    return token;
  }

  /** Return the token 'index' positions ahead of the current one, fetching
   *  (and caching on the Token.next chain) as needed. Index 0 is the
   *  current token. */
  final public Token getToken(int index) {
    Token t = token;
    for (int i = 0; i < index; i++) {
      if (t.next != null) t = t.next;
      else t = t.next = token_source.getNextToken();
    }
    return t;
  }

  /** Fetch (if necessary) the lookahead token and cache its kind in jj_ntk. */
  final private int jj_ntk() {
    if ((jj_nt=token.next) == null)
      return (jj_ntk = (token.next=token_source.getNextToken()).kind);
    else
      return (jj_ntk = jj_nt.kind);
  }

  // Error-reporting scratch state: each entry is a sequence of expected
  // token kinds (length 1 for this grammar).
  private java.util.Vector jj_expentries = new java.util.Vector();
  private int[] jj_expentry;
  // Kind that jj_consume_token failed to match, or -1 when none pending.
  private int jj_kind = -1;

  /** Build a ParseException listing the token kinds that were expected at
   *  the point of failure, reconstructed from jj_kind and the jj_la1
   *  choice-point bit masks. */
  public ParseException generateParseException() {
    jj_expentries.removeAllElements();
    // Array is sized to this grammar's token-kind count (generated value 16).
    boolean[] la1tokens = new boolean[16];
    for (int i = 0; i < 16; i++) {
      la1tokens[i] = false;
    }
    if (jj_kind >= 0) {
      la1tokens[jj_kind] = true;
      jj_kind = -1;
    }
    // Mark every kind acceptable at choice points reached during the
    // current generation (jj_la1[i] was stamped with jj_gen on failure).
    for (int i = 0; i < 1; i++) {
      if (jj_la1[i] == jj_gen) {
        for (int j = 0; j < 32; j++) {
          if ((jj_la1_0[i] & (1<<j)) != 0) {
            la1tokens[j] = true;
          }
        }
      }
    }
    for (int i = 0; i < 16; i++) {
      if (la1tokens[i]) {
        jj_expentry = new int[1];
        jj_expentry[0] = i;
        jj_expentries.addElement(jj_expentry);
      }
    }
    int[][] exptokseq = new int[jj_expentries.size()][];
    for (int i = 0; i < jj_expentries.size(); i++) {
      exptokseq[i] = (int[])jj_expentries.elementAt(i);
    }
    return new ParseException(token, exptokseq, tokenImage);
  }

  /** No-op: tracing is not generated for this parser. */
  final public void enable_tracing() {
  }

  /** No-op: tracing is not generated for this parser. */
  final public void disable_tracing() {
  }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -