📄 matchsegmentprocessor.java

📁 基于最大匹配算法的中文分词组件
💻 JAVA
字号:
/*
 * @作者:Hades , 创建日期:2006-11-17
 *
 * 汕头大学03计算机本科
 * 
 */
package edu.stu.cn.segment.matching.processor;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.LinkedList;
import java.util.Locale;

import edu.stu.cn.segment.matching.dictionary.DictionaryImpl;

/**
 * Abstract base class for dictionary-matching based Chinese word
 * segmentation.
 * <p>
 * It holds the dictionary and the separator character set, and provides a
 * file-to-file driver ({@link #fileProcessor(String, String)}) around the
 * abstract {@link #textProcess(String)} step that concrete matchers implement.
 * 
 * @author Hades Guan
 */
public abstract class MatchSegmentProcessor implements SegmentProcessorImpl
{

    /**
     * Dictionary used by the segmentation algorithm; {@code null} until set
     * through {@link #setDic(DictionaryImpl)}.
     */
    protected DictionaryImpl dic = null;

    /**
     * String whose characters are treated as separators; {@code null} until
     * {@link #initSeperator()} has been called.
     */
    protected String seperator = null;

    /**
     * ASCII digits and Latin letters. These are the only ASCII characters
     * that {@link #initSeperator()} does not add to the separator set.
     */
    protected final String CHAR_AND_NUM = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

    /**
     * Builds the separator set and stores it in {@link #seperator}.
     * <p>
     * The set consists of every character in U+0000..U+007F that is not in
     * {@link #CHAR_AND_NUM}, every character in U+FF00..U+FFEF, and an
     * explicit list of whitespace and CJK punctuation characters.
     * <p>
     * NOTE(review): nothing in this class calls this method; presumably
     * subclasses invoke it before segmenting - confirm against implementations.
     */
    protected void initSeperator()
    {
        StringBuilder buffer = new StringBuilder();
        // ASCII range: everything except English letters and digits.
        for (char c = '\u0000'; c <= '\u007F'; c++)
        {
            if (this.CHAR_AND_NUM.indexOf(c) < 0)
            {
                buffer.append(c);
            }
        }
        // The whole U+FF00..U+FFEF range is treated as separators.
        for (char c = '\uFF00'; c <= '\uFFEF'; c++)
        {
            buffer.append(c);
        }
        // Explicit extras; some repeat characters already added above, which
        // is harmless for a membership string.
        buffer.append(" \r\n《》？，。、：“；‘’”『』【】－―—─＝÷＋§·～！◎＃￥％…※×（）　");
        this.seperator = buffer.toString();
    }

    /**
     * Segments the text of {@code srcFile} and writes the resulting words,
     * each followed by a single space, to {@code tagFile}.
     * <p>
     * The whole input is read into memory (line terminators normalised to
     * {@code '\n'}), lower-cased and trimmed before being handed to
     * {@link #textProcess(String)}. The output file is opened only after
     * segmentation has produced a result, so a failure while reading or
     * segmenting does not leave a truncated output file behind.
     * <p>
     * I/O failures are not propagated: as before, the stack trace is printed
     * and the method returns normally.
     * <p>
     * NOTE(review): both files are read/written with the platform default
     * charset - confirm this matches the encoding of the corpus files.
     * 
     * @param srcFile
     *            path of the text file to segment
     * @param tagFile
     *            path of the file that receives the segmentation result
     */
    public void fileProcessor(String srcFile, String tagFile)
    {
        try
        {
            String text = readText(srcFile);

            // Locale-independent lower-casing: with the default locale a
            // Turkish environment would map 'I' to a dotless i.
            LinkedList<String> result = this.textProcess(text.toLowerCase(
                    Locale.ENGLISH).trim());

            writeWords(tagFile, result);
        }
        catch (IOException e)
        {
            // Also covers FileNotFoundException; kept as best-effort
            // reporting so callers see the same non-throwing contract.
            e.printStackTrace();
        }
    }

    /**
     * Reads the whole file into a string, terminating every line with
     * {@code '\n'}. The reader is closed even when reading fails.
     */
    private static String readText(String srcFile) throws IOException
    {
        BufferedReader in = new BufferedReader(new FileReader(srcFile));
        try
        {
            StringBuilder text = new StringBuilder();
            String line;
            while ((line = in.readLine()) != null)
            {
                text.append(line).append('\n');
            }
            return text.toString();
        }
        finally
        {
            in.close();
        }
    }

    /**
     * Writes each word followed by one space to the given file. The writer is
     * closed (and thereby flushed) even when writing fails, and write errors
     * surface as {@link IOException} instead of being silently dropped.
     */
    private static void writeWords(String tagFile, LinkedList<String> words)
            throws IOException
    {
        BufferedWriter out = new BufferedWriter(new FileWriter(tagFile));
        try
        {
            for (String w : words)
            {
                out.write(w + " ");
            }
        }
        finally
        {
            out.close();
        }
    }

    /**
     * @return the dictionary currently in use, possibly {@code null}
     */
    public DictionaryImpl getDic()
    {
        return dic;
    }

    /**
     * @param dic
     *            the dictionary to use
     */
    public void setDic(DictionaryImpl dic)
    {
        this.dic = dic;
    }

    /**
     * Segments {@code text} into words and returns them as a linked list.
     * 
     * @param text
     *            the text to segment
     * @return the segmentation result
     */
    abstract public LinkedList<String> textProcess(String text);
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -