📄 wikipediatokenizerimpl.java

📁 Lucene a java open-source SearchEngine Framework
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
    int i = 0;       /* index in packed string  */    int j = offset;  /* index in unpacked array */    int l = packed.length();    while (i < l) {      int count = packed.charAt(i++);      int value = packed.charAt(i++);      value--;      do result[j++] = value; while (--count > 0);    }    return j;  }  /* error codes */  private static final int ZZ_UNKNOWN_ERROR = 0;  private static final int ZZ_NO_MATCH = 1;  private static final int ZZ_PUSHBACK_2BIG = 2;  /* error messages for the codes above */  private static final String ZZ_ERROR_MSG[] = {    "Unkown internal scanner error",    "Error: could not match input",    "Error: pushback value was too large"  };  /**   * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>   */  private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();  private static final String ZZ_ATTRIBUTE_PACKED_0 =    "\11\0\1\11\7\1\1\11\10\1\1\11\2\1\1\11"+    "\13\1\1\11\6\1\2\11\3\0\1\11\14\0\2\1"+    "\2\11\1\1\1\0\1\1\1\0\1\1\1\0\1\1"+    "\3\0\7\1\2\0\1\1\1\0\15\1\3\0\1\1"+    "\1\11\3\0\1\1\1\11\5\0\1\1\4\0\1\1"+    "\2\0\2\1\2\0\1\1\5\0\1\11\3\1\3\0"+    "\1\1\2\0\1\11\30\0\1\1\2\0\3\11";  private static int [] zzUnpackAttribute() {    int [] result = new int[178];    int offset = 0;    offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);    return result;  }  private static int zzUnpackAttribute(String packed, int offset, int [] result) {    int i = 0;       /* index in packed string  */    int j = offset;  /* index in unpacked array */    int l = packed.length();    while (i < l) {      int count = packed.charAt(i++);      int value = packed.charAt(i++);      do result[j++] = value; while (--count > 0);    }    return j;  }  /** the input device */  private java.io.Reader zzReader;  /** the current state of the DFA */  private int zzState;  /** the current lexical state */  private int zzLexicalState = YYINITIAL;  /** this buffer contains the current text to be matched and is      the source 
of the yytext() string */  private char zzBuffer[] = new char[ZZ_BUFFERSIZE];  /** the textposition at the last accepting state */  private int zzMarkedPos;  /** the textposition at the last state to be included in yytext */  private int zzPushbackPos;  /** the current text position in the buffer */  private int zzCurrentPos;  /** startRead marks the beginning of the yytext() string in the buffer */  private int zzStartRead;  /** endRead marks the last character in the buffer, that has been read      from input */  private int zzEndRead;  /** number of newlines encountered up to the start of the matched text */  private int yyline;  /** the number of characters up to the start of the matched text */  private int yychar;  /**   * the number of characters from the last newline up to the start of the    * matched text   */  private int yycolumn;  /**    * zzAtBOL == true <=> the scanner is currently at the beginning of a line   */  private boolean zzAtBOL = true;  /** zzAtEOF == true <=> the scanner is at the EOF */  private boolean zzAtEOF;  /* user code: */public static final int ALPHANUM          = WikipediaTokenizer.ALPHANUM_ID;public static final int APOSTROPHE        = WikipediaTokenizer.APOSTROPHE_ID;public static final int ACRONYM           = WikipediaTokenizer.ACRONYM_ID;public static final int COMPANY           = WikipediaTokenizer.COMPANY_ID;public static final int EMAIL             = WikipediaTokenizer.EMAIL_ID;public static final int HOST              = WikipediaTokenizer.HOST_ID;public static final int NUM               = WikipediaTokenizer.NUM_ID;public static final int CJ                = WikipediaTokenizer.CJ_ID;public static final int INTERNAL_LINK     = WikipediaTokenizer.INTERNAL_LINK_ID;public static final int EXTERNAL_LINK     = WikipediaTokenizer.EXTERNAL_LINK_ID;public static final int CITATION          = WikipediaTokenizer.CITATION_ID;public static final int CATEGORY          = WikipediaTokenizer.CATEGORY_ID;public static final int BOLD        
      = WikipediaTokenizer.BOLD_ID;public static final int ITALICS           = WikipediaTokenizer.ITALICS_ID;public static final int BOLD_ITALICS      = WikipediaTokenizer.BOLD_ITALICS_ID;public static final int HEADING           = WikipediaTokenizer.HEADING_ID;public static final int SUB_HEADING       = WikipediaTokenizer.SUB_HEADING_ID;public static final int EXTERNAL_LINK_URL = WikipediaTokenizer.EXTERNAL_LINK_URL_ID;private int currentTokType;private int numBalanced = 0;private int positionInc = 1;private int numLinkToks = 0;public static final String [] TOKEN_TYPES = WikipediaTokenizer.TOKEN_TYPES;public final int yychar(){    return yychar;}public final int getPositionIncrement(){  return positionInc;}/** * Fills Lucene token with the current token text. */final void getText(Token t, int tokType) {  t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);}  /**   * Creates a new scanner   * There is also a java.io.InputStream version of this constructor.   *   * @param   in  the java.io.Reader to read input from.   */  WikipediaTokenizerImpl(java.io.Reader in) {    this.zzReader = in;  }  /**   * Creates a new scanner.   * There is also java.io.Reader version of this constructor.   *   * @param   in  the java.io.Inputstream to read input from.   */  WikipediaTokenizerImpl(java.io.InputStream in) {    this(new java.io.InputStreamReader(in));  }  /**    * Unpacks the compressed character translation table.   *   * @param packed   the packed character translation table   * @return         the unpacked character translation table   */  private static char [] zzUnpackCMap(String packed) {    char [] map = new char[0x10000];    int i = 0;  /* index in packed string  */    int j = 0;  /* index in unpacked array */    while (i < 230) {      int  count = packed.charAt(i++);      char value = packed.charAt(i++);      do map[j++] = value; while (--count > 0);    }    return map;  }  /**   * Refills the input buffer.   
*   * @return      <code>false</code>, iff there was new input.   *    * @exception   java.io.IOException  if any I/O-Error occurs   */  private boolean zzRefill() throws java.io.IOException {    /* first: make room (if you can) */    if (zzStartRead > 0) {      System.arraycopy(zzBuffer, zzStartRead,                       zzBuffer, 0,                       zzEndRead-zzStartRead);      /* translate stored positions */      zzEndRead-= zzStartRead;      zzCurrentPos-= zzStartRead;      zzMarkedPos-= zzStartRead;      zzPushbackPos-= zzStartRead;      zzStartRead = 0;    }    /* is the buffer big enough? */    if (zzCurrentPos >= zzBuffer.length) {      /* if not: blow it up */      char newBuffer[] = new char[zzCurrentPos*2];      System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);      zzBuffer = newBuffer;    }    /* finally: fill the buffer with new input */    int numRead = zzReader.read(zzBuffer, zzEndRead,                                            zzBuffer.length-zzEndRead);    if (numRead < 0) {      return true;    }    else {      zzEndRead+= numRead;      return false;    }  }      /**   * Closes the input stream.   */  public final void yyclose() throws java.io.IOException {    zzAtEOF = true;            /* indicate end of file */    zzEndRead = zzStartRead;  /* invalidate buffer    */    if (zzReader != null)      zzReader.close();  }  /**   * Resets the scanner to read from a new input stream.   * Does not close the old reader.   *   * All internal variables are reset, the old input stream    * <b>cannot</b> be reused (internal buffer is discarded and lost).   * Lexical state is set to <tt>ZZ_INITIAL</tt>.   
*
   * @param reader   the new input stream
   */
  public final void yyreset(java.io.Reader reader) {
    zzReader = reader;

    /* line/EOF flags back to their start-of-input values */
    zzAtBOL = true;
    zzAtEOF = false;

    /* every buffer position returns to the start of the buffer */
    zzStartRead = 0;
    zzEndRead = 0;
    zzPushbackPos = 0;
    zzMarkedPos = 0;
    zzCurrentPos = 0;

    /* position counters start over */
    yycolumn = 0;
    yychar = 0;
    yyline = 0;

    zzLexicalState = YYINITIAL;
  }

  /**
   * Reports the lexical state the scanner is currently in.
   *
   * @return the current lexical state
   */
  public final int yystate() {
    return zzLexicalState;
  }

  /**
   * Switches the scanner to another lexical state.
   *
   * @param newState the new lexical state
   */
  public final void yybegin(int newState) {
    zzLexicalState = newState;
  }

  /**
   * Returns the text matched by the current regular expression, i.e. the
   * buffer contents from zzStartRead up to (not including) zzMarkedPos.
   *
   * @return a new String holding the matched text
   */
  public final String yytext() {
    final int matchedLength = zzMarkedPos - zzStartRead;
    return new String(zzBuffer, zzStartRead, matchedLength);
  }

  /**
   * Returns the character at position <tt>pos</tt> from the
   * matched text.
   *
   * Gives the same result as yytext().charAt(pos) without building
   * the String.
   *
   * @param pos the position of the character to fetch.
   *            A value from 0 to yylength()-1.
   *
   * @return the character at position pos
   */
  public final char yycharat(int pos) {
    final int bufferIndex = zzStartRead + pos;
    return zzBuffer[bufferIndex];
  }

  /**
   * Returns the length of the matched text region.
   */
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -