📄 mmchinesetokenizer.java
字号:
/*
* MMChineseTokenizer.java
*
* Created on 2007年1月4日, 上午1:50
*
* To change this template, choose Tools | Template Manager
* and open the template in the editor.
*/
package util.word;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.TreeMap;
/*
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
*/
/**
*
* @author JinfengLee
*/
/**
 * Forward maximum-matching tokenizer for text that mixes Chinese and Basic Latin characters.
 *
 * <p>Characters are read from the current reader through a fixed-size buffer and grouped
 * into tokens as follows:
 * <ul>
 *   <li>A run of CJK Unified Ideographs is extended one character at a time for as long as
 *       the extended string is a key of the dictionary (full words and their prefixes of
 *       length 2 or more); the token type is {@code "double"}. Because prefixes are keys
 *       too, a returned token may be a prefix of a dictionary word rather than a complete
 *       word.</li>
 *   <li>A run of non-whitespace Basic Latin characters (letters, digits and punctuation
 *       alike) forms one token of type {@code "single"}.</li>
 *   <li>Characters from any other Unicode block are skipped. Supplementary characters
 *       (surrogate pairs) get no special treatment.</li>
 * </ul>
 *
 * <p>Token offsets are counted in {@code char}s from the start of the current reader:
 * the start offset is the index of the token's first character and the end offset is one
 * past its last character.
 *
 * <p>The dictionary is held in a static field and loaded only once, from the path given to
 * the first instance that is constructed; paths passed to later instances are not read.
 * Loading is not synchronized.
 */
public class MMChineseTokenizer extends Tokenizer {

    /**
     * Longest dictionary entry, in characters, that is loaded. Words of 7 or more
     * characters are not handled; raise this value if they are needed.
     */
    private static final int WORD_MAX_LENGTH = 6;

    /**
     * Shared dictionary. A complete word maps to {@code "1"}; a prefix (length 2 or more)
     * of a word that is not itself a word maps to {@code "2"}. Only key membership is
     * consulted while tokenizing.
     */
    private static TreeMap<String, String> dictionary = null;

    private static final int IO_BUFFER_SIZE = 50*1024;

    /** Index in {@link #ioBuffer} of the next character to consume. */
    private int bufferIndex = 0;

    /** Number of valid characters in {@link #ioBuffer}; non-positive once input has ended. */
    private int dataLength = 0;

    /** Number of characters consumed from the current reader so far. */
    private int offset = 0;

    private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

    private String dictionaryPath ;

    /**
     * Creates a tokenizer and loads the shared dictionary if it has not been loaded yet.
     *
     * @param dictionaryPath path of a UTF-8 dictionary file with one word per line
     */
    public MMChineseTokenizer(String dictionaryPath) {
        this.dictionaryPath = dictionaryPath ;
        // Load the dictionary.
        loadWords();
    }

    /**
     * Sets the reader to tokenize and restarts buffering and offset counting, so that
     * nothing left over from a previous reader leaks into the new token stream.
     *
     * @param input the new character source; may be {@code null}, in which case
     *              {@link #next()} returns {@code null}
     */
    public void setReader(Reader input){
        this.input = input;
        bufferIndex = 0;
        dataLength = 0;
        offset = 0;
    }

    /**
     * Returns the next token, or {@code null} when no reader is set or the input is
     * exhausted.
     *
     * @return the next token with its start offset, end offset and type, or {@code null}
     * @throws IOException if reading from the underlying reader fails
     */
    public Token next() throws IOException {
        if (input == null) {
            return null;
        }
        StringBuilder word = new StringBuilder();
        String tokenType = "word";
        int start = offset;
        while (true) {
            if (!fillBuffer()) {
                if (word.length() == 0) {
                    return null;
                }
                break;
            }
            char c = ioBuffer[bufferIndex++];
            // Count a character only once it has really been consumed, so that reaching
            // the end of input does not shift the offsets of the last token.
            offset++;
            // May be null for characters outside every defined block; the identity
            // comparisons below are null-safe.
            Character.UnicodeBlock block = Character.UnicodeBlock.of(c);
            if (block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
                if (word.length() == 0) {
                    tokenType = "double";
                    start = offset - 1;
                    word.append(c);
                } else if (dictionary.containsKey(word.toString() + c)) {
                    word.append(c);
                } else {
                    // The longer string is neither a word nor a prefix of one: push the
                    // character back so that it starts the next token.
                    bufferIndex--;
                    offset--;
                    break;
                }
                // Enhancement: end the token as soon as the script changes.
                if (!isNextCharIn(block)) {
                    break;
                }
            } else if (block == Character.UnicodeBlock.BASIC_LATIN) {
                if (Character.isWhitespace(c)) {
                    // Whitespace ends a pending token and is otherwise skipped. It has
                    // been consumed, which is why the end offset below is derived from
                    // the start offset and not from the running offset.
                    if (word.length() != 0) {
                        break;
                    }
                } else {
                    if (word.length() == 0) {
                        tokenType = "single";
                        start = offset - 1;
                    }
                    word.append(c);
                    // Enhancement: end the token as soon as the script changes.
                    if (!isNextCharIn(block)) {
                        break;
                    }
                }
            }
            // Characters of any other block are dropped.
        }
        return new Token(word.toString(), start, start + word.length(), tokenType);
    }

    /**
     * Makes sure at least one unread character is available in the buffer, reading from
     * the input when the buffer has been fully consumed.
     *
     * @return {@code true} if a character is available at {@code ioBuffer[bufferIndex]},
     *         {@code false} if the input has ended (a non-positive read count is treated
     *         as the end of input)
     * @throws IOException if reading from the underlying reader fails
     */
    private boolean fillBuffer() throws IOException {
        if (bufferIndex < dataLength) {
            return true;
        }
        dataLength = input.read(ioBuffer);
        bufferIndex = 0;
        return dataLength > 0;
    }

    /**
     * Peeks at the next unread character without consuming it. Must only be called after
     * the current character has been accepted into the token, because refilling the buffer
     * makes it impossible to push the current character back afterwards.
     *
     * @param block the Unicode block to compare against
     * @return {@code true} if another character exists and belongs to {@code block}
     * @throws IOException if reading from the underlying reader fails
     */
    private boolean isNextCharIn(Character.UnicodeBlock block) throws IOException {
        return fillBuffer() && Character.UnicodeBlock.of(ioBuffer[bufferIndex]) == block;
    }

    /**
     * Loads the shared dictionary from {@code dictionaryPath} unless it is already loaded.
     *
     * <p>The file is read as UTF-8, one word per line. A leading byte-order mark and
     * surrounding whitespace are removed; blank lines, lines containing {@code '#'} and
     * words longer than {@link #WORD_MAX_LENGTH} are ignored. Every accepted word is
     * stored together with each of its prefixes of length 2 or more.
     *
     * <p>An I/O failure is reported on standard error and whatever was read up to that
     * point (possibly nothing) becomes the dictionary; loading is best-effort and does not
     * throw {@link IOException}.
     */
    public void loadWords() {
        if (dictionary != null) {
            return;
        }
        TreeMap<String, String> loaded = new TreeMap<String, String>();
        InputStream is = null;
        InputStreamReader isr = null;
        BufferedReader br = null;
        try {
            is = new FileInputStream(dictionaryPath);
            isr = new InputStreamReader(is, "UTF-8");
            br = new BufferedReader(isr);
            boolean firstLine = true;
            String line;
            while ((line = br.readLine()) != null) {
                if (firstLine) {
                    firstLine = false;
                    // The UTF-8 decoder passes a byte-order mark through as U+FEFF,
                    // which would otherwise become part of the first word.
                    if (line.length() > 0 && line.charAt(0) == '\uFEFF') {
                        line = line.substring(1);
                    }
                }
                String word = line.trim();
                int wordLength = word.length();
                if (wordLength == 0 || wordLength > WORD_MAX_LENGTH
                        || word.indexOf('#') != -1) {
                    continue;
                }
                loaded.put(word, "1");
                // Register the prefixes so that matching can grow a word one character
                // at a time; never downgrade an entry that is already present.
                for (int i = wordLength - 1; i >= 2; i--) {
                    String prefix = word.substring(0, i);
                    if (!loaded.containsKey(prefix)) {
                        loaded.put(prefix, "2");
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Closing the outermost wrapper that was created also closes what it wraps.
            java.io.Closeable outermost = is;
            if (br != null) {
                outermost = br;
            } else if (isr != null) {
                outermost = isr;
            }
            if (outermost != null) {
                try {
                    outermost.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        dictionary = loaded;
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -