📄 matchchineseanalyzer.java

📁 基于最大匹配算法的中文分词组件
💻 JAVA
字号:
/*
 * @作者:Hades , 创建日期:2007-1-25
 *
 * 汕头大学03计算机本科
 * 
 */
package edu.stu.cn.lucene.analysis;

import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;

import edu.stu.cn.segment.matching.processor.SegmentProcessorImpl;

/**
 * Lucene {@link Analyzer} that tokenizes input with a
 * {@code MatchChineseTokenizer} (driven by a {@link SegmentProcessorImpl})
 * and then passes the tokens through a {@link StopFilter}.
 * <p>
 * The original author (Hades Guan) describes the segmentation as forward
 * maximum matching.
 *
 * @author Hades Guan
 */
public class MatchChineseAnalyzer extends Analyzer
{
    /**
     * Words that are rarely useful as search terms.
     * <p>
     * This array is not referenced anywhere else in this class: the
     * single-argument constructor takes its stop set from
     * {@code StopWordMaker.retreive()} instead.
     */
    public static final String[] STOP_WORDS =
    {
            "a", "and", "are", "as", "at", "be", "but", "by", "for", "if",
            "in", "into", "is", "it", "no", "not", "of", "on", "or", "s",
            "such", "t", "that", "the", "their", "then", "there", "these",
            "they", "this", "to", "was", "will", "with", "", "www"
    };

    /**
     * Segmentation component handed to every tokenizer this analyzer creates.
     * Stays {@code null} until a constructor or {@link #setProcessor} assigns
     * it.
     */
    private SegmentProcessorImpl processor;

    /**
     * Stop-word set passed to the {@link StopFilter} built in
     * {@link #tokenStream(String, Reader)}.
     */
    private Set stopTable;

    /**
     * Creates an analyzer that uses the stop set returned by
     * {@code StopWordMaker.retreive()}.
     *
     * @param processor
     *            segmentation component to use
     */
    public MatchChineseAnalyzer(SegmentProcessorImpl processor)
    {
        this.processor = processor;
        this.stopTable = StopWordMaker.retreive();
    }

    /**
     * Creates an analyzer that uses a caller-supplied list of stop words.
     *
     * @param processor
     *            segmentation component to use
     * @param stopWords
     *            stop words, converted to a set with
     *            {@link StopFilter#makeStopSet(String[])}
     */
    public MatchChineseAnalyzer(SegmentProcessorImpl processor,
            String[] stopWords)
    {
        this.processor = processor;
        this.stopTable = StopFilter.makeStopSet(stopWords);
    }

    /**
     * @return the segmentation component currently in use
     */
    public SegmentProcessorImpl getProcessor()
    {
        return this.processor;
    }

    /**
     * Replaces the segmentation component used for token streams created
     * after this call.
     *
     * @param processor
     *            the segmentation component to use from now on
     */
    public void setProcessor(SegmentProcessorImpl processor)
    {
        this.processor = processor;
    }

    /*
     * (non-Javadoc)
     *
     * Wraps a MatchChineseTokenizer over the reader in a StopFilter that
     * uses this analyzer's stop set. fieldName is not consulted.
     *
     * @see org.apache.lucene.analysis.Analyzer#tokenStream(java.lang.String,
     *      java.io.Reader)
     */
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader)
    {
        final MatchChineseTokenizer segmenter =
                new MatchChineseTokenizer(processor, reader);
        return new StopFilter(segmenter, stopTable);
    }

}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -