⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 seglist.cs

📁 c#常用类库大全
💻 CS
📖 第 1 页 / 共 3 页
字号:
            string strline = reader.ReadLine();

            SegList list;
            Hashtable child = new Hashtable();
            
            long i = 0;
            while (strline != null && strline.Trim() != "")
            {
                i++;
                strChar1 = strline.Substring(0, 1);
                strChar2 = strline.Substring(1, 1);
                if (!htWords.ContainsKey(strChar1))
                {
                    father = new Hashtable();
                    htWords.Add(strChar1, father);
                }
                else
                {
                    father = (Hashtable)htWords[strChar1];
                }

                if (!father.ContainsKey(strChar2))
                {
                    list = new SegList();
                    if (strline.Length > 2)
                        list.Add(strline.Substring(2));
                    else
                        list.Add("null");
                    father.Add(strChar2, list);
                }
                else
                {
                    list = (SegList)father[strChar2];
                    if (strline.Length > 2)
                    {
                        list.Add(strline.Substring(2));
                    }
                    else
                    {
                        list.Add("null");
                    }
                    father[strChar2] = list;
                }
                htWords[strChar1] = father;
                strline = reader.ReadLine();
            }
            try
            {
                reader.Close();
            }
            catch
            { }
            SetCache("jcms_dict", htWords);
        }
        htWords = (Hashtable)GetCache("jcms_dict");

        alNoise =  LoadWords(NoisePath, alNoise);
        alNumber = LoadWords(NumberPath, alNumber);
        alWord =   LoadWords(WordPath, alWord);
        alPrefix = LoadWords(PrefixPath, alPrefix);

        TimeSpan duration = DateTime.Now - start;
        m_EventTime = duration.TotalMilliseconds;
    }

    /// <summary>
    /// Reads a UTF-8 text file line by line and returns every line as an entry of a new ArrayList.
    /// </summary>
    /// <param name="strPath">Path of the text file to read.</param>
    /// <param name="list">
    /// Ignored on input: a fresh ArrayList is always created and returned. The parameter is kept
    /// so that existing callers continue to compile.
    /// </param>
    /// <returns>A new ArrayList holding one string per line, in file order (empty lines included).</returns>
    /// <remarks>
    /// Errors raised while opening or reading the file are not caught here and propagate to the caller.
    /// </remarks>
    public ArrayList LoadWords(string strPath, ArrayList list)
    {
        list = new ArrayList();

        // The using block releases the file handle even when ReadLine throws part-way through,
        // which the previous "Close() after the loop" pattern did not guarantee.
        using (StreamReader reader = new StreamReader(strPath, System.Text.Encoding.UTF8))
        {
            string strline;
            while ((strline = reader.ReadLine()) != null)
            {
                list.Add(strline);
            }
        }

        return list;
    }

    /// <summary>
    /// Prints the contents of the two-level word table to the console.
    /// Each output line is the first-level key, the second-level key and one element of the
    /// stored list, concatenated in that order.
    /// </summary>
    public void OutWords()
    {
        foreach (DictionaryEntry outer in htWords)
        {
            // Each first-level value is itself a table keyed by the second character.
            Hashtable secondLevel = (Hashtable)outer.Value;
            foreach (DictionaryEntry inner in secondLevel)
            {
                SegList tails = (SegList)inner.Value;
                int index = 0;
                while (index < tails.Count)
                {
                    string line = outer.Key.ToString() + inner.Key.ToString() + tails.GetElem(index).ToString();
                    Console.WriteLine(line);
                    index++;
                }
            }
        }
    }

    /// <summary>
    /// Writes the string form of every element of an ArrayList to the console, one per line.
    /// </summary>
    /// <param name="list">Items to print; nothing is written when this is null.</param>
    public void OutArrayList(ArrayList list)
    {
        if (list != null)
        {
            foreach (object item in list)
            {
                Console.WriteLine(item.ToString());
            }
        }
    }

    /// <summary>
    /// 分词过程,不支持回车 
    /// </summary>
    /// <param name="strText">要分词的文本</param>
    /// <returns>分词后的文本</returns>
    public string SegmentText(string strText)
    {
        strText = (strText + "$").Trim();
        if (htWords == null)    return strText;
        if (strText.Length < 3) return strText;
        DateTime start = DateTime.Now;
        int length = 0;
        int preFix = 0;
        bool word = false;
        bool number = false;
        string reText = "";
        string strPrefix = "";
        string strLastChar = "";
        string strLastWords = Separator;

        for (int i = 0; i < strText.Length - 1; i++)
        {
            #region 对于每一个字的处理过程
            string strChar1 = strText.Substring(i, 1);
            string strChar2 = strText.Substring(i + 1, 1).Trim();
            bool yes;
            SegList l;
            Hashtable h;

            if (reText.Length > 0) strLastChar = reText.Substring(reText.Length - 1);

            if (strChar1 == " ")
            {
                if ((number || word) && strLastChar != Separator) reText += this.Separator;
                yes = true;
            }
            else
                yes = false;

            int CharType = GetCharType(strChar1);
            switch (CharType)
            {
                case 1:
                    #region  如果是数字,如果数字的上一位是字母要和后面的数字分开
                    if (word)
                    {
                        reText += Separator;
                    }
                    word = false;
                    number = true;
                    strLastWords = "";
                    break;
                    #endregion
                case 2:
                case 5:
                    #region 如果是字母
                    if (number)
                        strLastWords = Separator;
                    else
                        strLastWords = "";

                    word = true;
                    number = false;
                    break;
                    #endregion
                case 3:
                case 4:
                    #region 第一级哈希表是否包含关键字,假如包含处理第二级哈希表
                    //上一个字是否为字母
                    if (word) reText += Separator;

                    #region 检测上一个是否是数字,这个过程是用于修正数字后的量词的
                    if (number && CharType != 4)
                    {
                        h = (Hashtable)htWords["n"];
                        if (h.ContainsKey(strChar1))
                        {
                            l = (SegList)h[strChar1];
                            if (l.Contains(strChar2))
                            {
                                reText += strChar1 + strChar2 + Separator;
                                yes = true;
                                i++;
                            }
                            else if (l.Contains("null"))
                            {
                                reText += strChar1 + Separator;
                                yes = true;
                            }
                        }
                        else
                            reText += Separator;
                    }
                    #endregion

                    //非汉字数字的汉字
                    if (CharType == 3)
                    {
                        word = false;
                        number = false;
                        strLastWords = Separator;
                    }
                    else
                    {
                        word = false;
                        number = true;
                        strLastWords = "";
                    }

                    //第二级哈希表取出
                    h = (Hashtable)htWords[strChar1];

                    //第二级哈希表是否包含关键字
                    if (h.ContainsKey(strChar2))
                    {
                        #region  第二级包含关键字
                        //取出ArrayList对象
                        l = (SegList)h[strChar2];

                        //遍历每一个对象 看是否能组合成词
                        for (int j = 0; j < l.Count; j++)
                        {
                            bool have = false;
                            string strChar3 = l.GetElem(j).ToString();

                            //对于每一个取出的词进行检测,看是否匹配,长度保护
                            if ((strChar3.Length + i + 2) < strText.Length)
                            {
                                //向i+2后取出m长度的字
                                string strChar = strText.Substring(i + 2, strChar3.Length).Trim();
                                if (strChar3 == strChar && !yes)
                                {
                                    if (strPrefix != "")
                                    {
                                        reText += strPrefix + Separator;
                                        strPrefix = "";
                                        preFix = 0;
                                    }
                                    reText += strChar1 + strChar2 + strChar;
                                    i += strChar3.Length + 1;
                                    have = true;
                                    yes = true;
                                    break;
                                }
                            }
                            else if ((strChar3.Length + i + 2) == strText.Length)
                            {
                                string strChar = strText.Substring(i + 2).Trim();
                                if (strChar3 == strChar && !yes)
                                {
                                    if (strPrefix != "")
                                    {
                                        reText += strPrefix + Separator;
                                        strPrefix = "";
                                        preFix = 0;
                                    }
                                    reText += strChar1 + strChar2 + strChar;
                                    i += strChar3.Length + 1;
                                    have = true;
                                    yes = true;
                                    break;
                                }
                            }

                            if (!have && j == l.Count - 1 && l.Contains("null") && !yes)
                            {
                                if (preFix == 1)
                                {
                                    reText += strPrefix + strChar1 + strChar2;
                                    strPrefix = "";
                                    preFix = 0;
                                }
                                else if (preFix > 1)
                                {
                                    reText += strPrefix + strLastWords + strChar1 + strChar2;
                                    strPrefix = "";
                                    preFix = 0;
                                }
                                else
                                {
                                    if (CharType == 4) reText += strChar1 + strChar2;
                                    else reText += strChar1 + strChar2;
                                    strLastWords = this.Separator;
                                    number = false;
                                }
                                i++;
                                yes = true;
                                break;
                            }
                            else if (have)
                            {
                                break;
                            }
                        }
                        #endregion

                        //如果没有匹配还可能有一种情况,这个词语只有两个字,以这两个字开头的词语不存在
                        if (!yes && l.Contains("null"))

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -