⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 extractwords.cs

📁 KTDictSeg 简介: KTDictSeg 是由KaiToo搜索开发的一款基于字典的简单中英文分词算法 * 主要功能: 中英文分词
💻 CS
📖 第 1 页 / 共 2 页
字号:
/***************************************************************************************
 * KTDictSeg 简介: KTDictSeg 是由KaiToo搜索开发的一款基于字典的简单中英文分词算法
 * 主要功能: 中英文分词,未登录词识别,多元歧义自动识别,全角字符识别能力
 * 主要性能指标:
 * 分词准确度:90%以上(有待专家的权威评测)
 * 处理速度: 600KBytes/s
 * 
 * 版本: V1.2.02 
 * Copyright(c) 2007 http://www.kaitoo.com 
 * 作者:肖波
 * 授权: 开源GPL
 * 公司网站: http://www.kaitoo.com
 * 个人博客: http://blog.csdn.net/eaglet; http://www.cnblogs.com/eaglet
 * 联系方式: blog.eaglet@gmail.com
 * ***************************************************************************************/

using System;
using System.Collections.Generic;
using System.Collections;
using System.Text;
using System.Diagnostics;


namespace FTAlgorithm
{
    /// <summary>
    /// Direction in which candidate word groups are scanned when they are compared.
    /// </summary>
    public enum T_Direction
    {
        /// <summary>
        /// From left to right.
        /// </summary>
        LeftToRight = 0,

        /// <summary>
        /// From right to left.
        /// </summary>
        RightToLeft = 1,
    }

    /// <summary>
    /// Information about a single word.
    /// </summary>
    public class T_WordInfo
    {
        /// <summary>
        /// The word itself.
        /// </summary>
        public String Word;

        /// <summary>
        /// Position of the word's first character within the full text.
        /// </summary>
        public int Position;

        /// <summary>
        /// Weight level of the word.
        /// </summary>
        public int Rank;

        /// <summary>
        /// Tag associated with the word.
        /// </summary>
        public object Tag;
    }

    /// <summary>
    /// Callback used by CExtractWords to choose between two candidate node lists.
    /// It is invoked with the currently selected node list as <paramref name="pre"/> and the
    /// new candidate as <paramref name="cur"/>; returning true selects the candidate.
    /// </summary>
    /// <param name="words">Word list that the entries of <paramref name="pre"/> and <paramref name="cur"/> index into.</param>
    /// <param name="pre">Node list of the currently selected candidate.</param>
    /// <param name="cur">Node list of the new candidate.</param>
    /// <returns>True when the new candidate should replace the current selection.</returns>
    public delegate bool CompareByPosFunc(List<T_WordInfo> words, List<int> pre, List<int> cur);
    /// <summary>
    /// Callback used by CExtractWords to confirm or veto a candidate node list.
    /// It is invoked with the currently selected node list as <paramref name="pre"/> and the
    /// new candidate as <paramref name="cur"/>; returning true selects the candidate.
    /// NOTE(review): the name suggests a word-frequency based decision - confirm against the implementations.
    /// </summary>
    /// <param name="words">Word list that the entries of <paramref name="pre"/> and <paramref name="cur"/> index into.</param>
    /// <param name="pre">Node list of the currently selected candidate.</param>
    /// <param name="cur">Node list of the new candidate.</param>
    /// <returns>True when the new candidate should replace the current selection.</returns>
    public delegate bool SelectByFreqFunc(List<T_WordInfo> words, List<int> pre, List<int> cur);

    /// <summary>
    /// 从全文中提取指定的单词,及其位置
    /// </summary>
    public class CExtractWords
    {
        // Word store that InsertWordToDfa writes to and GetTag reads from.
        CWordDfa m_WordDfa;
        // Node list of the currently selected candidate; GameTree passes it as "pre"
        // to the comparison callbacks and clears it when a new candidate is selected.
        List<int> m_GameNodes;
        // spaceNum of the currently selected candidate; GameTree compares each new candidate against it.
        int m_MinSpace;
        // deep of the currently selected candidate; GameTree compares each new candidate against it.
        int m_MinDeep;

        // Backing field of MatchDirection; initialised to LeftToRight by the constructor.
        T_Direction m_MatchDirection;
        // Backing field of CompareByPosEvent; may be null (GameTree checks before calling it).
        CompareByPosFunc m_CompareByPos;
        // Backing field of SelectByFreqEvent; may be null (GameTree checks before calling it).
        SelectByFreqFunc m_SelectByFreq;

        /// <summary>
        /// Gets or sets the optional callback consulted by GameTree when a candidate ties
        /// with the current selection on both depth and space and the minimum space is 0.
        /// When it is null, the built-in word-length comparison is used instead.
        /// </summary>
        public CompareByPosFunc CompareByPosEvent
        {
            get { return m_CompareByPos; }
            set { m_CompareByPos = value; }
        }

        /// <summary>
        /// Gets or sets the optional callback consulted by GameTree when a candidate would
        /// otherwise be accepted and the current minimum space is 0; its result then decides
        /// whether the candidate is actually selected. When it is null, the candidate is accepted.
        /// </summary>
        public SelectByFreqFunc SelectByFreqEvent
        {
            get { return m_SelectByFreq; }
            set { m_SelectByFreq = value; }
        }


        /// <summary>
        /// Gets or sets the match direction. GameTree passes this value to the
        /// word-length comparison of candidate groups.
        /// </summary>
        public T_Direction MatchDirection
        {
            get { return m_MatchDirection; }
            set { m_MatchDirection = value; }
        }

        /// <summary>
        /// Creates an extractor with an empty word store and a left-to-right match direction.
        /// </summary>
        public CExtractWords()
        {
            m_WordDfa = new CWordDfa();
            m_MatchDirection = T_Direction.LeftToRight;
        }

        /// <summary>
        /// Looks up <paramref name="word"/> in the internal word store and returns the result.
        /// </summary>
        /// <param name="word">Word to look up.</param>
        /// <returns>
        /// Whatever the word store's GetTag returns for the word.
        /// NOTE(review): presumably the tag supplied to InsertWordToDfa - confirm against CWordDfa,
        /// including what is returned for an unknown word.
        /// </returns>
        public object GetTag(String word)
        {
            return m_WordDfa.GetTag(word);
        }

        /// <summary>
        /// Forwards <paramref name="word"/> and <paramref name="tag"/> to the internal word store.
        /// </summary>
        /// <param name="word">Word to insert.</param>
        /// <param name="tag">Caller-defined object passed along with the word.</param>
        public void InsertWordToDfa(String word, object tag)
        {
            m_WordDfa.InsertWordToDfa(word, tag);
        }


        /// <summary>
        /// Decides whether the candidate group <paramref name="cur"/> should be preferred over
        /// <paramref name="pre"/> by comparing word lengths index by index. The first index at
        /// which the lengths differ settles the result: true if the candidate's word is longer,
        /// false if it is shorter.
        /// </summary>
        /// <param name="words">Word list that the entries of both groups index into.</param>
        /// <param name="pre">Currently selected group (indices into <paramref name="words"/>).</param>
        /// <param name="cur">Candidate group (indices into <paramref name="words"/>).</param>
        /// <param name="direction">
        /// LeftToRight scans from index 0 upwards; RightToLeft scans from the last index of
        /// <paramref name="cur"/> downwards. The same index is used for both groups.
        /// </param>
        /// <returns>
        /// True only when a differing position is found and the candidate's word is longer there;
        /// false when all compared words have equal length, when the scan reaches an index that
        /// <paramref name="pre"/> does not have, or when nothing is compared.
        /// </returns>
        private bool CompareGroup(List<T_WordInfo> words, List<int> pre, List<int> cur, T_Direction direction)
        {
            bool forward = direction == T_Direction.LeftToRight;
            bool backward = direction == T_Direction.RightToLeft;
            int step = forward ? 1 : -1;

            // Any direction other than the two known values leaves both flags false,
            // so the loop body never runs and the method reports false.
            for (int index = forward ? 0 : cur.Count - 1;
                 (forward && index < cur.Count) || (backward && index >= 0);
                 index += step)
            {
                // pre has no entry at this index: stop without preferring the candidate.
                if (index >= pre.Count)
                {
                    return false;
                }

                int curLength = words[cur[index]].Word.Length;
                int preLength = words[pre[index]].Word.Length;

                // The first position where the lengths differ decides the comparison.
                if (curLength != preLength)
                {
                    return curLength > preLength;
                }
            }

            return false;
        }

        /// <summary>
        /// 博弈树
        /// </summary>
        /// <param name="words"></param>
        /// <param name="nodes"></param>
        /// <param name="init"></param>
        /// <param name="begin"></param>
        /// <param name="end"></param>
        /// <param name="spaceNum"></param>
        /// <param name="deep"></param>
        /// <returns></returns>
        private List<int> GameTree(List<T_WordInfo> words, List<int> nodes, bool init, int begin, int end, ref int spaceNum, ref int deep)
        {
            if (init)
            {
                int startPos = ((T_WordInfo)words[begin]).Position;
                for (int i = begin; i <= end ; i++) 
                {
                    T_WordInfo wordInfo = (T_WordInfo)words[i];
                    spaceNum = wordInfo.Position - startPos;
                    deep = 0;
                    List<int> oneNodes;

                    if (i == end)
                    {
                        oneNodes = new List<int>();
                        oneNodes.Add(i);
                        deep++;
                    }
                    else
                    {
                        oneNodes = GameTree(words, nodes, false, i, end, ref spaceNum, ref deep);
                    }

                    if (oneNodes != null)
                    {
                        bool select = false;

                        if (m_MinSpace > spaceNum ||
                            (m_MinSpace == spaceNum && deep < m_MinDeep))
                        {
                            select = true;

                            if (m_MinSpace == 0)
                            {
                                if (SelectByFreqEvent != null)
                                {
                                    select = SelectByFreqEvent(words, m_GameNodes, oneNodes);
                                }
                            }

                        }
                        else if (m_MinDeep == deep && m_MinSpace == spaceNum)
                        {
                            if (m_CompareByPos != null && m_MinSpace == 0)
                            {
                                select = m_CompareByPos(words, m_GameNodes, oneNodes);
                            }
                            else
                            {
                                select = CompareGroup(words, m_GameNodes, oneNodes, MatchDirection);
                            }
                        }


                        if (select)
                        {
                            m_MinDeep = deep;
                            m_MinSpace = spaceNum;
                            m_GameNodes.Clear();
                            foreach (int obj in oneNodes)

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -