📄 mmchinesetokenizer.java

📁 本人自己写的基于Lucene的简单搜索引擎
💻 JAVA
字号:
/*
 * MMChineseTokenizer.java
 *
 * Created on 2007年1月4日, 上午1:50
 *
 * To change this template, choose Tools | Template Manager
 * and open the template in the editor.
 */

package util.word;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.TreeMap;

/*
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
*/
/**
 * Dictionary-driven forward maximum-matching tokenizer for mixed Chinese and
 * Latin text.
 *
 * <p>Tokenization rules implemented by {@link #next()}:
 * <ul>
 *   <li>A run of CJK unified ideographs keeps growing while the extended
 *       string is still a key of the dictionary; the dictionary holds complete
 *       words plus every prefix (two or more characters) of those words.</li>
 *   <li>A run of non-whitespace {@code BASIC_LATIN} characters forms one
 *       token.</li>
 *   <li>Whitespace and characters of any other Unicode block are skipped and
 *       act as separators.</li>
 * </ul>
 *
 * <p>Known limitation: matching never backtracks, so a token may end on a
 * dictionary prefix that is not itself a complete word.
 *
 * <p>NOTE(review): the Lucene imports for {@code Token} and {@code Tokenizer}
 * are commented out in this file, so both types are presumably resolved from
 * this package -- confirm.
 *
 * @author JinfengLee
 */
public class MMChineseTokenizer extends Tokenizer {
  // Words of 7 or more characters are not handled; raise this limit if you
  // need longer dictionary entries.
  private static final int WORD_MAX_LENGTH = 6;

  /**
   * Dictionary shared by all instances: complete words map to "1", word
   * prefixes of length >= 2 map to "2". Only key membership is consulted
   * while tokenizing. Loaded once, by the first instance that is created.
   */
  private static TreeMap<String, String> dictionary = null;

  private static final int IO_BUFFER_SIZE = 50*1024;

  /** Token type reported for CJK words. */
  private static final String TYPE_CJK = "double";

  /** Token type reported for Latin words. */
  private static final String TYPE_LATIN = "single";

  /** Index of the next unread character in {@link #ioBuffer}. */
  private int bufferIndex = 0;

  /** Number of valid characters in {@link #ioBuffer}; -1 after end of input. */
  private int dataLength = 0;

  /** Number of characters consumed from the current reader so far. */
  private int offset = 0;

  private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

  private String tokenType = "word";

  private final String dictionaryPath;

  /**
   * Creates a tokenizer and loads the shared dictionary if it is not loaded yet.
   *
   * @param dictionaryPath path of a UTF-8 word list, one word per line; it is
   *     ignored when another instance has already loaded the dictionary
   */
  public MMChineseTokenizer(String dictionaryPath) {
    this.dictionaryPath = dictionaryPath;
    // Load the dictionary.
    loadWords();
  }

  /**
   * Sets the character source and restarts tokenization from offset 0.
   *
   * <p>The read buffer and the running offset are reset so that characters and
   * offsets of a previously tokenized reader do not leak into the new one.
   *
   * @param input the reader to tokenize; {@code null} makes {@link #next()}
   *     return {@code null}
   */
  public void setReader(Reader input) {
    this.input = input;
    bufferIndex = 0;
    dataLength = 0;
    offset = 0;
  }

  /**
   * Returns the next token, or {@code null} when no reader is set or the
   * input is exhausted.
   *
   * <p>The token's start and end offsets are character positions within the
   * current reader (end exclusive); its type is {@code "double"} for CJK words
   * and {@code "single"} for Latin words.
   *
   * @return the next token or {@code null}
   * @throws IOException if reading from the underlying reader fails
   */
  public Token next() throws IOException {
    if (input == null) {
      return null;
    }

    StringBuilder word = new StringBuilder();
    int start = offset;

    while (true) {
      // Look at the next character without consuming it, so a character that
      // does not belong to the current token never has to be pushed back.
      int peeked = peek();
      if (peeked < 0) {
        break;
      }

      char c = (char) peeked;
      // of() may return null for characters outside every defined block;
      // the identity comparisons below are null-safe.
      Character.UnicodeBlock block = Character.UnicodeBlock.of(c);
      boolean started = word.length() > 0;

      if (block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
        if (!started) {
          tokenType = TYPE_CJK;
          start = offset;
        } else if (!TYPE_CJK.equals(tokenType)
            || !dictionary.containsKey(word.toString() + c)) {
          // Either a Latin word is in progress, or extending the CJK word
          // would leave the dictionary: end the token before this character.
          break;
        }
      } else if (block == Character.UnicodeBlock.BASIC_LATIN
          && !Character.isWhitespace(c)) {
        if (!started) {
          tokenType = TYPE_LATIN;
          start = offset;
        } else if (!TYPE_LATIN.equals(tokenType)) {
          // A CJK word is in progress; the Latin run starts a new token.
          break;
        }
      } else {
        // Whitespace or a character of another block: a separator.
        if (started) {
          break;
        }
        consume();
        continue;
      }

      consume();
      word.append(c);
    }

    if (word.length() == 0) {
      return null;
    }
    return new Token(word.toString(), start, offset, tokenType);
  }

  /**
   * Returns the next unread character without consuming it, refilling the
   * buffer from the reader when it is exhausted.
   *
   * @return the character as an int, or -1 at end of input
   * @throws IOException if reading from the underlying reader fails
   */
  private int peek() throws IOException {
    if (bufferIndex >= dataLength) {
      dataLength = input.read(ioBuffer);
      bufferIndex = 0;
      if (dataLength < 1) {
        return -1;
      }
    }
    return ioBuffer[bufferIndex];
  }

  /** Consumes the character last returned by {@link #peek()}. */
  private void consume() {
    bufferIndex++;
    offset++;
  }

  /**
   * Loads the shared dictionary from {@code dictionaryPath} unless it has
   * already been loaded.
   *
   * <p>Lines containing {@code '#'} and lines longer than
   * {@code WORD_MAX_LENGTH} characters are ignored. Every accepted word is
   * stored with value "1"; each of its prefixes of length >= 2 that is not
   * already present is stored with value "2". I/O failures are reported on
   * standard error and leave the dictionary with whatever was read up to that
   * point (possibly empty).
   */
  public void loadWords() {
    if (dictionary != null) {
      return;
    }

    // Build into a local map and publish it only when loading has finished,
    // so the shared field never exposes a half-filled dictionary.
    TreeMap<String, String> loaded = new TreeMap<String, String>();

    InputStream is = null;
    BufferedReader br = null;

    try {
      is = new FileInputStream(dictionaryPath);
      br = new BufferedReader(new InputStreamReader(is, "UTF-8"));

      String word;
      while ((word = br.readLine()) != null) {
        int wordLength = word.length();
        if (word.indexOf("#") != -1 || wordLength > WORD_MAX_LENGTH) {
          continue;
        }

        loaded.put(word, "1");

        // Register the prefixes so next() can grow a word one character at
        // a time; an existing entry (possibly a real word) is never
        // downgraded to a prefix marker.
        for (int i = wordLength - 1; i >= 2; i--) {
          String prefix = word.substring(0, i);
          if (!loaded.containsKey(prefix)) {
            loaded.put(prefix, "2");
          }
        }
      }
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      try {
        // Closing the outermost wrapper closes the streams beneath it; fall
        // back to the raw stream when the wrappers were never created.
        if (br != null) {
          br.close();
        } else if (is != null) {
          is.close();
        }
      } catch (IOException e) {
        e.printStackTrace();
      }
    }

    dictionary = loaded;
  }
}
💿 文件大小 3639 K
👤 上传用户 gggic
📂 所属分类 Java编程
🏷️ 相关标签

#lucene #引擎
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -