📄 mychineseanalyzer.java
字号:
/**
*
*/
package com.lsp.modules.wordsplit;
import org.apache.lucene.analysis.*;
import java.io.Reader;
import java.io.*;
import com.xjt.nlp.word.ICTCLAS;
import java.util.Set;
/**
 * Lucene analyzer for Chinese text that delegates word segmentation to ICTCLAS.
 *
 * <p>The complete field text is read into memory, handed to
 * {@code ICTCLAS.paragraphProcess}, and the string it returns is tokenized with a
 * {@link LowerCaseTokenizer} and filtered through a {@link StopFilter}.
 *
 * @author lsp Created on 2007-1-1 21:02:02
 */
public final class MyChineseAnalyzer extends Analyzer {

    /** Stop-word set handed to every {@link StopFilter} this analyzer creates. */
    private final Set stopWords;

    /**
     * Default stop words: common English function words plus two Chinese
     * pronouns ("I" and "we"). Despite the name, the list is not English-only.
     */
    public static final String[] ENGLISH_STOP_WORDS = {
        "a", "an", "and", "are", "as", "at", "be", "but", "by",
        "for", "if", "in", "into", "is", "it",
        "no", "not", "of", "on", "or", "s", "such",
        "t", "that", "the", "their", "then", "there", "these",
        "they", "this", "to", "was", "will", "with",
        "我", "我们"
    };

    /** Builds an analyzer which removes the words in {@link #ENGLISH_STOP_WORDS}. */
    public MyChineseAnalyzer() {
        stopWords = StopFilter.makeStopSet(ENGLISH_STOP_WORDS);
    }

    /**
     * Builds an analyzer which removes the words in the provided array.
     *
     * @param stopWords words to drop from the token stream
     */
    public MyChineseAnalyzer(String[] stopWords) {
        this.stopWords = StopFilter.makeStopSet(stopWords);
    }

    /**
     * Reads everything remaining in {@code reader} and returns it as one string.
     *
     * <p>The text is copied verbatim, line terminators included, so words on
     * adjacent lines are not fused together. An exhausted reader yields the
     * empty string, never {@code null}. The reader is not closed here; it
     * remains owned by the caller.
     *
     * @param reader source of characters
     * @return the full remaining content of {@code reader}, possibly empty
     * @throws IOException if reading from {@code reader} fails
     */
    public String readerToString(Reader reader) throws IOException {
        // Accumulate into a writer that starts empty; concatenating onto a null
        // String would prepend the literal text "null" to the result.
        StringWriter text = new StringWriter();
        char[] buffer = new char[4096];
        int count;
        while ((count = reader.read(buffer)) != -1) {
            text.write(buffer, 0, count);
        }
        return text.toString();
    }

    /**
     * Segments the field text with ICTCLAS, then filters a
     * {@link LowerCaseTokenizer} over the segmented text with a {@link StopFilter}.
     *
     * <p>NOTE(review): this relies on the string returned by
     * {@code paragraphProcess} separating words with non-letter characters
     * (presumably spaces) so that the tokenizer can split them; confirm
     * against the ICTCLAS binding in use.
     *
     * @param fieldName name of the field being analysed; used only in the error message
     * @param reader source of the field text; read to the end by this call
     * @return a token stream over the segmented text; never {@code null}
     * @throws RuntimeException wrapping the original cause if reading or
     *         segmentation fails
     */
    public TokenStream tokenStream(String fieldName, Reader reader) {
        try {
            ICTCLAS splitWord = ICTCLAS.getInstance();
            String inputString = readerToString(reader);
            String resultString = splitWord.paragraphProcess(inputString);
            return new StopFilter(
                    new LowerCaseTokenizer(new StringReader(resultString)), stopWords);
        } catch (Exception e) {
            // Broad catch kept on purpose: the ICTCLAS binding's declared
            // exceptions are not visible from this file. Fail loudly with the
            // cause attached instead of returning null, which would only
            // surface later as an unexplained NullPointerException in the caller.
            throw new RuntimeException("转换出错: field=" + fieldName, e);
        }
    }

    /**
     * Command-line entry point; intentionally does nothing.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // No-op placeholder.
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -