📄 wikipediatokenizerimpl.java
字号:
int i = 0;       /* index in packed string  */
    int j = offset;  /* index in unpacked array */
    int l = packed.length();
    while (i < l) {
      int count = packed.charAt(i++);
      int value = packed.charAt(i++);
      value--;
      do result[j++] = value; while (--count > 0);
    }
    return j;
  }


  /* error codes */
  private static final int ZZ_UNKNOWN_ERROR = 0;
  private static final int ZZ_NO_MATCH = 1;
  private static final int ZZ_PUSHBACK_2BIG = 2;

  /* error messages for the codes above (indexed by error code) */
  private static final String[] ZZ_ERROR_MSG = {
    "Unkown internal scanner error",
    "Error: could not match input",
    "Error: pushback value was too large"
  };

  /**
   * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
   */
  private static final int[] ZZ_ATTRIBUTE = zzUnpackAttribute();

  /** Run-length encoded attribute table: pairs of (count, value) chars. */
  private static final String ZZ_ATTRIBUTE_PACKED_0 =
    "\11\0\1\11\7\1\1\11\10\1\1\11\2\1\1\11"+
    "\13\1\1\11\6\1\2\11\3\0\1\11\14\0\2\1"+
    "\2\11\1\1\1\0\1\1\1\0\1\1\1\0\1\1"+
    "\3\0\7\1\2\0\1\1\1\0\15\1\3\0\1\1"+
    "\1\11\3\0\1\1\1\11\5\0\1\1\4\0\1\1"+
    "\2\0\2\1\2\0\1\1\5\0\1\11\3\1\3\0"+
    "\1\1\2\0\1\11\30\0\1\1\2\0\3\11";

  /**
   * Expands {@link #ZZ_ATTRIBUTE_PACKED_0} into a freshly allocated
   * 178-entry attribute table.
   *
   * @return the unpacked attribute table
   */
  private static int[] zzUnpackAttribute() {
    int[] result = new int[178];
    zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, 0, result);
    return result;
  }

  /**
   * Decodes a run-length encoded string of (count, value) char pairs into
   * <code>result</code>, starting at index <code>offset</code>.
   *
   * @param packed the packed table; consumed two chars at a time
   * @param offset index in <code>result</code> at which to start writing
   * @param result the array receiving the unpacked values
   * @return the index in <code>result</code> just past the last value written
   */
  private static int zzUnpackAttribute(String packed, int offset, int[] result) {
    int i = 0;       /* index in packed string  */
    int j = offset;  /* index in unpacked array */
    int l = packed.length();
    while (i < l) {
      int count = packed.charAt(i++);
      int value = packed.charAt(i++);
      /* do/while: each value is written at least once, even for count == 0 */
      do {
        result[j++] = value;
      } while (--count > 0);
    }
    return j;
  }

  /** the input device */
  private java.io.Reader zzReader;

  /** the current state of the DFA */
  private int zzState;

  /** the current lexical state */
  private int zzLexicalState = YYINITIAL;

  /** this buffer contains the current text to be matched and is
      the source of the yytext() string */
  private char[] zzBuffer = new char[ZZ_BUFFERSIZE];

  /** the textposition at the last accepting state */
  private int zzMarkedPos;

  /** the textposition at the last state to be included in yytext */
  private int zzPushbackPos;

  /** the current text position in the buffer */
  private int zzCurrentPos;

  /** startRead marks the beginning of the yytext() string in the buffer */
  private int zzStartRead;

  /** endRead marks the last character in the buffer, that has been read
      from input */
  private int zzEndRead;

  /** number of newlines encountered up to the start of the matched text */
  private int yyline;

  /** the number of characters up to the start of the matched text */
  private int yychar;

  /**
   * the number of characters from the last newline up to the start of the
   * matched text
   */
  private int yycolumn;

  /**
   * zzAtBOL == true <=> the scanner is currently at the beginning of a line
   */
  private boolean zzAtBOL = true;

  /** zzAtEOF == true <=> the scanner is at the EOF */
  private boolean zzAtEOF;

  /* user code: */

  /* Token type ids, mirrored from the constants declared on WikipediaTokenizer. */
  public static final int ALPHANUM = WikipediaTokenizer.ALPHANUM_ID;
  public static final int APOSTROPHE = WikipediaTokenizer.APOSTROPHE_ID;
  public static final int ACRONYM = WikipediaTokenizer.ACRONYM_ID;
  public static final int COMPANY = WikipediaTokenizer.COMPANY_ID;
  public static final int EMAIL = WikipediaTokenizer.EMAIL_ID;
  public static final int HOST = WikipediaTokenizer.HOST_ID;
  public static final int NUM = WikipediaTokenizer.NUM_ID;
  public static final int CJ = WikipediaTokenizer.CJ_ID;
  public static final int INTERNAL_LINK = WikipediaTokenizer.INTERNAL_LINK_ID;
  public static final int EXTERNAL_LINK = WikipediaTokenizer.EXTERNAL_LINK_ID;
  public static final int CITATION = WikipediaTokenizer.CITATION_ID;
  public static final int CATEGORY = WikipediaTokenizer.CATEGORY_ID;
  public static final int BOLD = WikipediaTokenizer.BOLD_ID;
  public static final int ITALICS = WikipediaTokenizer.ITALICS_ID;
  public static final int BOLD_ITALICS = WikipediaTokenizer.BOLD_ITALICS_ID;
  public static final int HEADING = WikipediaTokenizer.HEADING_ID;
  public static final int SUB_HEADING = WikipediaTokenizer.SUB_HEADING_ID;
  public static final int EXTERNAL_LINK_URL = WikipediaTokenizer.EXTERNAL_LINK_URL_ID;

  private int currentTokType;
  private int numBalanced = 0;
  private int positionInc = 1;
  private int numLinkToks = 0;

  public static final String[] TOKEN_TYPES = WikipediaTokenizer.TOKEN_TYPES;

  /**
   * Returns the number of characters up to the start of the matched text.
   */
  public final int yychar() {
    return yychar;
  }

  /**
   * Returns the current value of the position-increment counter.
   */
  public final int getPositionIncrement() {
    return positionInc;
  }

  /**
   * Fills Lucene token with the current token text.
   *
   * @param t the token whose term buffer is set to the matched region
   *          <code>zzBuffer[zzStartRead .. zzMarkedPos)</code>
   * @param tokType not used by this method
   */
  final void getText(Token t, int tokType) {
    t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
  }


  /**
   * Creates a new scanner
   * There is also a java.io.InputStream version of this constructor.
   *
   * @param   in  the java.io.Reader to read input from.
   */
  WikipediaTokenizerImpl(java.io.Reader in) {
    this.zzReader = in;
  }

  /**
   * Creates a new scanner.
   * There is also java.io.Reader version of this constructor.
   *
   * The stream is wrapped in a java.io.InputStreamReader constructed without
   * an explicit charset, so bytes are decoded with the JVM's default charset.
   *
   * @param   in  the java.io.Inputstream to read input from.
   */
  WikipediaTokenizerImpl(java.io.InputStream in) {
    this(new java.io.InputStreamReader(in));
  }

  /**
   * Unpacks the compressed character translation table.
   *
   * @param packed   the packed character translation table
   * @return         the unpacked character translation table
   */
  private static char[] zzUnpackCMap(String packed) {
    char[] map = new char[0x10000];
    int i = 0;  /* index in packed string  */
    int j = 0;  /* index in unpacked array */
    /* NOTE(review): 230 is presumably the generator-emitted length of the
       packed map; the string itself is not visible here -- confirm. */
    while (i < 230) {
      int count = packed.charAt(i++);
      char value = packed.charAt(i++);
      do {
        map[j++] = value;
      } while (--count > 0);
    }
    return map;
  }


  /**
   * Refills the input buffer.
   *
   * Already-consumed text before <code>zzStartRead</code> is discarded by
   * shifting the live region to the front of the buffer, the buffer is
   * doubled if the current position has reached its end, and then more
   * characters are read from the underlying reader.
   *
   * @return      <code>false</code>, iff there was new input;
   *              <code>true</code> when the reader reported end of stream.
   *
   * @exception   java.io.IOException  if any I/O-Error occurs
   */
  private boolean zzRefill() throws java.io.IOException {

    /* first: make room (if you can) */
    if (zzStartRead > 0) {
      System.arraycopy(zzBuffer, zzStartRead,
                       zzBuffer, 0,
                       zzEndRead - zzStartRead);

      /* translate stored positions */
      zzEndRead -= zzStartRead;
      zzCurrentPos -= zzStartRead;
      zzMarkedPos -= zzStartRead;
      zzPushbackPos -= zzStartRead;
      zzStartRead = 0;
    }

    /* is the buffer big enough? */
    if (zzCurrentPos >= zzBuffer.length) {
      /* if not: blow it up */
      char[] newBuffer = new char[zzCurrentPos * 2];
      System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
      zzBuffer = newBuffer;
    }

    /* finally: fill the buffer with new input */
    int numRead = zzReader.read(zzBuffer, zzEndRead,
                                zzBuffer.length - zzEndRead);

    if (numRead > 0) {
      zzEndRead += numRead;
      return false;
    }

    if (numRead == 0) {
      if (zzEndRead >= zzBuffer.length) {
        /* no room was offered to the reader, so 0 says nothing about the
           stream; report "not at EOF" exactly as before */
        return false;
      }
      /* unlikely but not impossible: the reader delivered 0 characters
         without being at end of stream. Reporting "new input" here would
         leave zzEndRead unchanged although nothing was added, so force
         progress with a single-character read instead. */
      int c = zzReader.read();
      if (c == -1) {
        return true;
      }
      zzBuffer[zzEndRead++] = (char) c;
      return false;
    }

    /* numRead < 0: end of stream */
    return true;
  }


  /**
   * Closes the input stream.
   *
   * Also marks the scanner as being at EOF and invalidates the buffer
   * contents. A <code>null</code> reader is tolerated.
   *
   * @exception   java.io.IOException  if closing the reader fails
   */
  public final void yyclose() throws java.io.IOException {
    zzAtEOF = true;            /* indicate end of file */
    zzEndRead = zzStartRead;   /* invalidate buffer    */

    if (zzReader != null)
      zzReader.close();
  }


  /**
   * Resets the scanner to read from a new input stream.
   * Does not close the old reader.
   *
   * All internal variables are reset, the old input stream
   * <b>cannot</b> be reused (internal buffer is discarded and lost).
   * Lexical state is set to <tt>YYINITIAL</tt>.
   *
   * @param reader   the new input stream
   */
  public final void yyreset(java.io.Reader reader) {
    zzReader = reader;
    zzAtBOL = true;
    zzAtEOF = false;
    zzEndRead = zzStartRead = 0;
    zzCurrentPos = zzMarkedPos = zzPushbackPos = 0;
    yyline = yychar = yycolumn = 0;
    zzLexicalState = YYINITIAL;
  }


  /**
   * Returns the current lexical state.
   */
  public final int yystate() {
    return zzLexicalState;
  }


  /**
   * Enters a new lexical state
   *
   * @param newState the new lexical state
   */
  public final void yybegin(int newState) {
    zzLexicalState = newState;
  }


  /**
   * Returns the text matched by the current regular expression.
   */
  public final String yytext() {
    return new String(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
  }


  /**
   * Returns the character at position <tt>pos</tt> from the
   * matched text.
   *
   * It is equivalent to yytext().charAt(pos), but faster
   *
   * @param pos the position of the character to fetch.
   *            A value from 0 to yylength()-1.
   *
   * @return the character at position pos
   */
  public final char yycharat(int pos) {
    return zzBuffer[zzStartRead + pos];
  }


  /**
   * Returns the length of the matched text region.
   */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -