📄 cjktokenizer.java
字号:
package org.apache.lucene.analysis.cjk;/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */import org.apache.lucene.analysis.Token;import org.apache.lucene.analysis.Tokenizer;import java.io.Reader;/** * CJKTokenizer was modified from StopTokenizer which does a decent job for * most European languages. It performs other token methods for double-byte * Characters: the token will return at each two charactors with overlap match.<br> * Example: "java C1C2C3C4" will be segment to: "java" "C1C2" "C2C3" "C3C4" it * also need filter filter zero length token ""<br> * for Digit: digit, '+', '#' will token as letter<br> * for more info on Asia language(Chinese Japanese Korean) text segmentation: * please search <a * href="http://www.google.com/search?q=word+chinese+segment">google</a> * * @author Che, Dong */public final class CJKTokenizer extends Tokenizer { //~ Static fields/initializers --------------------------------------------- /** Max word length */ private static final int MAX_WORD_LEN = 255; /** buffer size: */ private static final int IO_BUFFER_SIZE = 256; //~ Instance fields -------------------------------------------------------- /** word offset, used to imply which character(in ) is parsed */ private int offset = 0; /** the index used only for ioBuffer */ private int bufferIndex = 0; /** data length */ private int dataLen = 0; /** * character buffer, store the characters which are used to compose <br> * the returned Token */ private final char[] buffer = new char[MAX_WORD_LEN]; /** * I/O buffer, used to store the content of the input(one of the <br> * members of Tokenizer) */ private final char[] ioBuffer = new char[IO_BUFFER_SIZE]; /** word type: single=>ASCII double=>non-ASCII word=>default */ private String tokenType = "word"; /** * tag: previous character is a cached double-byte character "C1C2C3C4" * ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened) * C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4" */ private boolean preIsTokened = false; //~ Constructors ----------------------------------------------------------- /** * Construct a token stream processing the given input. * * @param in I/O reader */ public CJKTokenizer(Reader in) { input = in; } //~ Methods ---------------------------------------------------------------- /** * Returns the next token in the stream, or null at EOS. * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html * for detail. * * @return Token * * @throws java.io.IOException - throw IOException when read error <br> * hanppened in the InputStream * */ public final Token next() throws java.io.IOException { /** how many character(s) has been stored in buffer */ int length = 0; /** the position used to create Token */ int start = offset; while (true) { /** current charactor */ char c; /** unicode block of current charactor for detail */ Character.UnicodeBlock ub; offset++; if (bufferIndex >= dataLen) { dataLen = input.read(ioBuffer); bufferIndex = 0; } if (dataLen == -1) { if (length > 0) { if (preIsTokened == true) { length = 0; preIsTokened = false; } break; } else { return null; } } else { //get current character c = ioBuffer[bufferIndex++]; //get the UnicodeBlock of the current character ub = Character.UnicodeBlock.of(c); } //if the current character is ASCII or Extend ASCII if ((ub == Character.UnicodeBlock.BASIC_LATIN) || (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) ) { if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) { /** convert HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */ int i = (int) c; i = i - 65248; c = (char) i; } // if the current character is a letter or "_" "+" "#" if (Character.isLetterOrDigit(c) || ((c == '_') || (c == '+') || (c == '#')) ) { if (length == 0) { // "javaC1C2C3C4linux" <br> // ^--: the current character begin to token the ASCII // letter start = offset - 1; } else if (tokenType == "double") { // "javaC1C2C3C4linux" <br> // ^--: the previous non-ASCII // : the current character offset--; bufferIndex--; tokenType = "single"; if (preIsTokened == true) { // there is only one non-ASCII has been stored length = 0; preIsTokened = false; break; } else { break; } } // store the LowerCase(c) in the buffer buffer[length++] = Character.toLowerCase(c); tokenType = "single"; // break the procedure if buffer overflowed! if (length == MAX_WORD_LEN) { break; } } else if (length > 0) { if (preIsTokened == true) { length = 0; preIsTokened = false; } else { break; } } } else { // non-ASCII letter, eg."C1C2C3C4" if (Character.isLetter(c)) { if (length == 0) { start = offset - 1; buffer[length++] = c; tokenType = "double"; } else { if (tokenType == "single") { offset--; bufferIndex--; //return the previous ASCII characters break; } else { buffer[length++] = c; tokenType = "double"; if (length == 2) { offset--; bufferIndex--; preIsTokened = true; break; } } } } else if (length > 0) { if (preIsTokened == true) { // empty the buffer length = 0; preIsTokened = false; } else { break; } } } } return new Token(new String(buffer, 0, length), start, start + length, tokenType ); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -