📄 mychineseanalyzer.java

📁 中文分词,中科院分词的改装版。使用java调用dll来实现的。

💻 JAVA

字号:

/**
 * 
 */
package com.lsp.modules.wordsplit;
import org.apache.lucene.analysis.*;
import java.io.Reader;
import java.io.*;
import com.xjt.nlp.word.ICTCLAS;
import java.util.Set;
/**
 * Lucene {@link Analyzer} that hands the input text to the {@link ICTCLAS}
 * word segmenter and then tokenizes the segmenter's output with a
 * {@link LowerCaseTokenizer} followed by a {@link StopFilter}.
 *
 * @author lsp Created on 2007-1-1 21:02:02
 */
public final class MyChineseAnalyzer extends Analyzer {
	/** Stop-word set produced by {@code StopFilter.makeStopSet}; passed to every {@link StopFilter}. */
	private final Set stopWords;

	/** Default stop words: common English function words plus two Chinese pronouns. */
	 public static final String[] ENGLISH_STOP_WORDS = { 
		    "a", "an", "and", "are", "as", "at", "be", "but", "by", 
		    "for", "if", "in", "into", "is", "it", 
		    "no", "not", "of", "on", "or", "s", "such", 
		    "t", "that", "the", "their", "then", "there", "these", 
		    "they", "this", "to", "was", "will", "with", 
		    "我","我们" 
		  }; 

	/** Builds an analyzer which removes the words in {@link #ENGLISH_STOP_WORDS}. */
	public MyChineseAnalyzer() {
		stopWords = StopFilter.makeStopSet(ENGLISH_STOP_WORDS);
	}

	/**
	 * Builds an analyzer which removes the words in the provided array.
	 *
	 * @param stopWords the words to filter out of every token stream
	 */
	public MyChineseAnalyzer(String[] stopWords) {
		this.stopWords = StopFilter.makeStopSet(stopWords);
	}

	/**
	 * Reads the whole content of {@code reader} into a single string.
	 *
	 * <p>The input is read line by line and the lines are concatenated
	 * directly; line terminators are discarded. The reader is not closed.
	 *
	 * @param reader the character source to drain
	 * @return the concatenated lines, or the empty string when the reader
	 *         yields no lines (never {@code null})
	 * @throws IOException if reading from {@code reader} fails
	 */
	public String readerToString(Reader reader) throws IOException {
		BufferedReader lineReader = new BufferedReader(reader);
		// Start from an empty buffer: starting from a null String and using
		// += would prepend the literal text "null" to the result.
		StringBuilder text = new StringBuilder();
		String line;
		while ((line = lineReader.readLine()) != null) {
			text.append(line);
		}
		return text.toString();
	}

	/**
	 * Segments the text from {@code reader} with ICTCLAS and returns a token
	 * stream over the segmented text, lower-cased and with stop words removed.
	 *
	 * <p>The reader is fully consumed and closed before this method returns,
	 * because the returned stream reads from an in-memory copy and would
	 * otherwise never close it.
	 *
	 * @param fieldName name of the field being analyzed (unused)
	 * @param reader    source of the text to analyze
	 * @return the filtered token stream, or {@code null} if reading or
	 *         segmentation fails
	 */
	public TokenStream tokenStream(String fieldName, Reader reader) {
		try {
			String inputString;
			try {
				inputString = readerToString(reader);
			} finally {
				closeQuietly(reader);
			}

			// Nothing to segment: skip the segmenter call and let the
			// tokenizer produce an empty stream.
			String resultString = "";
			if (inputString.length() > 0) {
				ICTCLAS splitWord = ICTCLAS.getInstance();
				resultString = splitWord.paragraphProcess(inputString);
			}
			// NOTE(review): debug trace kept from the original; it prints the
			// full segmented text on every call.
			System.out.println("spliteResult"+resultString);
			return new StopFilter(new LowerCaseTokenizer(new StringReader(resultString)), stopWords);
		} catch (Exception e) {
			System.out.println("转换出错");
			// Report the cause as well; the message above alone gives no way
			// to diagnose the failure.
			System.err.println("MyChineseAnalyzer.tokenStream failed: " + e);
			return null;
		}
	}

	/** Closes {@code reader} if it is non-null, ignoring any failure to close. */
	private static void closeQuietly(Reader reader) {
		if (reader == null) {
			return;
		}
		try {
			reader.close();
		} catch (IOException ignored) {
			// The content has already been read; a close failure cannot affect the result.
		}
	}

	/**
	 * Empty entry point; does nothing.
	 *
	 * @param args command-line arguments (unused)
	 */
	public static void main(String[] args) {
		// intentionally empty
	}

}

⌨️ 快捷键说明

复制代码 Ctrl + C

搜索代码 Ctrl + F

全屏模式 F11

切换主题 Ctrl + Shift + D

显示快捷键 ?

增大字号 Ctrl + =

减小字号 Ctrl + -