📄 maxmatchsegmentprocessor.java

📁 基于最大匹配算法的中文分词组件
💻 JAVA
字号:
/*
 * @作者:Hades , 创建日期:2006-11-17
 *
 * 汕头大学03计算机本科
 * 
 */
package edu.stu.cn.segment.matching.processor;

import java.util.LinkedList;
import java.util.StringTokenizer;

import edu.stu.cn.segment.matching.dictionary.DictionaryImpl;

/**
 * Word segmenter implementing the forward maximum-matching method: for each
 * position it tries the longest remaining candidate first and shortens it one
 * character at a time until the dictionary accepts it or a single character
 * is left.
 *
 * @author Hades Guan
 */
public class MaxMatchSegmentProcessor extends MatchSegmentProcessor
{
    /**
     * Default constructor. Only calls the inherited {@code initSeperator()};
     * the dictionary is left as the superclass provides it.
     */
    public MaxMatchSegmentProcessor()
    {
        this.initSeperator();
    }

    /**
     * Creates a processor that matches words against the given dictionary.
     *
     * @param dic
     *            dictionary instance used for word look-ups
     */
    public MaxMatchSegmentProcessor(DictionaryImpl dic)
    {
        this.dic = dic;
        this.initSeperator();
    }

    /**
     * Segments {@code text} and returns the pieces in their order of
     * appearance.
     * <p>
     * The text is lower-cased (default locale) and split on the characters of
     * the inherited {@code seperator} string. Every separator character is
     * emitted as its own one-character entry, including leading, trailing and
     * consecutive separators; the text between separators is segmented by
     * forward maximum matching.
     *
     * @param text
     *            text to segment; may be {@code null}
     * @return the segmentation result, or {@code null} when {@code text} is
     *         {@code null}
     */
    public LinkedList<String> textProcess(String text)
    {
        if (text == null)
            return null;

        LinkedList<String> result = new LinkedList<String>();

        // returnDelims = true makes the tokenizer hand back every delimiter as
        // its own token, so separators land in the result at their true
        // position without any separate index bookkeeping against the text.
        StringTokenizer tokenizer = new StringTokenizer(text.toLowerCase(),
                this.seperator, true);

        while (tokenizer.hasMoreTokens())
        {
            String token = tokenizer.nextToken();

            // A non-delimiter token can never start with a delimiter, so the
            // first code point is enough to tell the two kinds apart.
            if (this.seperator.indexOf(token.codePointAt(0)) >= 0)
                result.add(token);
            else
                this.segmentToken(token, result);
        }
        return result;
    }

    /**
     * Segments one separator-free, non-empty token by forward maximum
     * matching and appends the pieces to {@code result}.
     * <p>
     * A run of characters contained in the inherited {@code CHAR_AND_NUM}
     * (presumably letters and digits; confirm in the superclass) that starts
     * exactly at the current position is emitted as a single piece without
     * consulting the dictionary.
     *
     * @param token
     *            non-empty text containing no separator characters
     * @param result
     *            list receiving the pieces in order
     */
    private void segmentToken(String token, LinkedList<String> result)
    {
        int len = token.length();
        // Start of the piece currently being matched.
        int pos = 0;
        // Exclusive end of the current candidate; invariant: end > pos.
        int end = len;

        while (pos < len)
        {
            if (CHAR_AND_NUM.indexOf(token.charAt(end - 1)) >= 0)
            {
                // Walk back to the start of the run, but never past pos:
                // characters before pos have already been emitted.
                int runStart = end;
                while (runStart > pos
                        && CHAR_AND_NUM.indexOf(token.charAt(runStart - 1)) >= 0)
                    runStart--;

                // The whole candidate is one letter/digit run: keep it intact.
                if (runStart == pos)
                {
                    result.add(token.substring(pos, end));
                    pos = end;
                    end = len;
                    continue;
                }
            }

            String word = token.substring(pos, end);
            // Accept a dictionary word, or fall back to a single character so
            // that unknown characters still make progress.
            if (dic.match(word) || word.length() == 1)
            {
                result.add(word);
                pos = end;
                end = len;
            }
            else
            {
                // Candidate too long: drop its last character and retry.
                end--;
            }
        }
    }
}
💿 文件大小 32 K
👤 上传用户 szdoudou
📂 所属分类多国语言处理
🏷️ 相关标签

#匹配算法 #分
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -