⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 dict.cs

📁 KTDictSeg 简介: KTDictSeg 是由KaiToo搜索开发的一款基于字典的简单中英文分词算法 * 主要功能: 中英文分词
💻 CS
字号:
using System;
using System.Collections;
using System.Collections.Generic;
using System.Text;
using System.IO;
using FTAlgorithm.General;

namespace KTDictSeg
{
    /// <summary>
    /// Serializable container that holds the list of dictionary entries.
    /// </summary>
    [Serializable]
    public class T_DictFile
    {
        /// <summary>
        /// The dictionary entries. Starts out as an empty list.
        /// </summary>
        public List<T_DictStruct> Dicts = new List<T_DictStruct>();
    }

    /// <summary>
    /// A single dictionary entry: a word together with its part-of-speech
    /// value and its frequency.
    /// </summary>
    [Serializable]
    public class T_DictStruct
    {
        /// <summary>
        /// The word.
        /// </summary>
        public String Word;

        /// <summary>
        /// Part of speech, stored as an integer.
        /// </summary>
        public int Pos;

        /// <summary>
        /// Word frequency.
        /// </summary>
        public double Frequency;

        /// <summary>
        /// Returns <see cref="Word"/> as the textual form of this entry.
        /// </summary>
        public override string ToString()
        {
            return Word;
        }
    }

    public class Dict
    {
        // Characters whose presence in a word adds the T_POS.POS_A_M flag.
        private static readonly char[] ChineseNumerals =
        {
            '一', '二', '三', '四', '五', '六', '七', '八', '九', '十'
        };

        /// <summary>
        /// Loads a dictionary from a UTF-8 text file holding one
        /// <c>word|pos</c> entry per line.
        /// </summary>
        /// <remarks>
        /// Lines may end in CRLF or in a bare LF. A line is skipped silently
        /// when it does not split into exactly two fields on '|' or when its
        /// second field is not a valid integer.
        /// </remarks>
        /// <param name="fileName">Path of the text dictionary to read.</param>
        /// <returns>
        /// A <see cref="T_DictFile"/> containing every entry that parsed.
        /// </returns>
        static public T_DictFile LoadFromTextDict(String fileName)
        {
            T_DictFile dictFile = new T_DictFile();

            String dictStr = CFile.ReadFileToString(fileName, "utf-8");

            // Accept LF-only files as well as CRLF ones; splitting on CRLF
            // alone turns an LF-only file into one unusable line.
            String[] words = CRegex.Split(dictStr, "\r?\n");

            // Same tolerance the per-line split below already gets.
            if (words == null)
            {
                return dictFile;
            }

            foreach (String word in words)
            {
                String[] wp = CRegex.Split(word, @"\|");

                if (wp == null || wp.Length != 2)
                {
                    continue;
                }

                // TryParse rejects malformed numbers without raising and
                // swallowing an exception for every bad line.
                int pos;
                if (!int.TryParse(wp[1], out pos))
                {
                    continue;
                }

                T_DictStruct dict = new T_DictStruct();
                dict.Word = wp[0];
                dict.Pos = pos;

                // Any word containing one of the numerals 一..十 additionally
                // gets the POS_A_M flag OR-ed into its part of speech.
                if (dict.Word.IndexOfAny(ChineseNumerals) >= 0)
                {
                    dict.Pos |= (int)T_POS.POS_A_M;
                }

                // The entry "字典" is always forced to POS_D_N, replacing
                // whatever value was parsed or OR-ed in above.
                if (dict.Word == "字典")
                {
                    dict.Pos = (int)T_POS.POS_D_N;
                }

                dictFile.Dicts.Add(dict);
            }

            return dictFile;
        }

        /// <summary>
        /// Writes every entry of <paramref name="dictFile"/> to a UTF-8 text
        /// file as one <c>word|pos</c> line per entry, each ending in CRLF.
        /// </summary>
        /// <param name="fileNmae">Path of the text file to write.</param>
        /// <param name="dictFile">
        /// Dictionary to save. Nothing is written when its entry list is null.
        /// </param>
        static public void SaveToTextFile(String fileNmae, T_DictFile dictFile)
        {
            List<T_DictStruct> entries = dictFile.Dicts;

            if (entries == null)
            {
                return;
            }

            StringBuilder text = new StringBuilder();

            for (int index = 0; index < entries.Count; index++)
            {
                T_DictStruct entry = entries[index];
                text.AppendFormat("{0}|{1}\r\n", entry.Word, entry.Pos);
            }

            String content = text.ToString();
            CFile.WriteString(fileNmae, content, "utf-8");
        }

        /// <summary>
        /// Serializes <paramref name="dictFile"/> with
        /// <c>CSerialization.SerializeBinary</c> and writes the resulting
        /// stream to <paramref name="fileName"/>.
        /// </summary>
        /// <param name="fileName">Path of the binary file to write.</param>
        /// <param name="dictFile">Dictionary to serialize.</param>
        static public void SaveToBinFile(String fileName, T_DictFile dictFile)
        {
            Stream serialized = CSerialization.SerializeBinary(dictFile);

            // Rewind to the start before handing the stream to the writer.
            serialized.Seek(0, SeekOrigin.Begin);

            // The writer takes a MemoryStream, so the serializer's result
            // must actually be one; otherwise this cast throws.
            MemoryStream payload = (MemoryStream)serialized;
            CFile.WriteStream(fileName, payload);
        }

        /// <summary>
        /// Reads <paramref name="fileName"/> into memory and deserializes it
        /// with <c>CSerialization.DeserializeBinary</c>.
        /// </summary>
        /// <remarks>
        /// NOTE(review): if the helper is backed by a general-purpose binary
        /// formatter, only trusted files should be passed in - confirm.
        /// </remarks>
        /// <param name="fileName">Path of the serialized dictionary.</param>
        /// <returns>The deserialized object cast to <see cref="T_DictFile"/>.</returns>
        static public T_DictFile LoadFromBinFile(String fileName)
        {
            MemoryStream content = CFile.ReadFileToStream(fileName);

            // Rewind to the start before deserializing.
            content.Seek(0, SeekOrigin.Begin);

            object deserialized;
            CSerialization.DeserializeBinary(content, out deserialized);

            T_DictFile result = (T_DictFile)deserialized;
            return result;
        }

        /// <summary>
        /// Writes <paramref name="dictFile"/> in the "KTDictSeg Dict V1.3"
        /// binary layout.
        /// </summary>
        /// <remarks>
        /// Layout: a 32-byte header containing the UTF-8 signature padded with
        /// zero bytes, followed by one record per entry:
        /// <c>[int32 length][UTF-8 word][int32 pos][double frequency]</c>,
        /// where <c>length</c> counts the bytes of word + pos + frequency.
        /// Numbers are written with <see cref="BitConverter"/>, i.e. in the
        /// byte order of the machine that writes the file.
        /// </remarks>
        /// <param name="fileName">Path of the file to create or overwrite.</param>
        /// <param name="dictFile">Dictionary whose entries are written.</param>
        static public void SaveToBinFileEx(String fileName, T_DictFile dictFile)
        {
            // "using" guarantees the handle is released even when an entry
            // fails to encode part-way through the loop.
            using (FileStream fs = new FileStream(fileName, FileMode.Create))
            {
                // Fixed-size header: signature bytes first, the rest stays zero.
                byte[] version = new byte[32];
                byte[] signature = Encoding.UTF8.GetBytes("KTDictSeg Dict V1.3");
                Array.Copy(signature, version, signature.Length);

                fs.Write(version, 0, version.Length);

                foreach (T_DictStruct dict in dictFile.Dicts)
                {
                    byte[] word = Encoding.UTF8.GetBytes(dict.Word);
                    byte[] pos = BitConverter.GetBytes(dict.Pos);
                    byte[] frequency = BitConverter.GetBytes(dict.Frequency);

                    // Length prefix covers everything that follows it in the record.
                    byte[] length = BitConverter.GetBytes(word.Length + frequency.Length + pos.Length);

                    fs.Write(length, 0, length.Length);
                    fs.Write(word, 0, word.Length);
                    fs.Write(pos, 0, pos.Length);
                    fs.Write(frequency, 0, frequency.Length);
                }
            }
        }

        /// <summary>
        /// Loads a dictionary written in the "KTDictSeg Dict V..." binary
        /// layout, falling back to <see cref="LoadFromBinFile"/> when the
        /// file does not start with that signature.
        /// </summary>
        /// <remarks>
        /// Expected layout: a 32-byte header, then records of the form
        /// <c>[int32 length][UTF-8 word][int32 pos][double frequency]</c>
        /// where <c>length</c> counts the bytes of word + pos + frequency.
        /// </remarks>
        /// <param name="fileName">Path of the dictionary file.</param>
        /// <returns>The loaded dictionary.</returns>
        /// <exception cref="InvalidDataException">
        /// A record declares a length too small to hold pos and frequency.
        /// </exception>
        /// <exception cref="EndOfStreamException">
        /// The file ends in the middle of a record.
        /// </exception>
        static public T_DictFile LoadFromBinFileEx(String fileName)
        {
            T_DictFile dictFile = new T_DictFile();
            dictFile.Dicts = new List<T_DictStruct>();

            // Open read-only: the default access is read/write, which makes
            // loading fail on read-only files. "using" releases the handle on
            // every exit path, including exceptions.
            using (FileStream fs = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.Read))
            {
                // A file shorter than the header leaves the tail zero-filled,
                // which simply fails the signature match below.
                byte[] version = new byte[32];
                ReadBlock(fs, version);
                String ver = Encoding.UTF8.GetString(version, 0, version.Length);

                String verNumStr = CRegex.GetMatch(ver, "KTDictSeg Dict V(.+)", true);

                if (verNumStr != null && verNumStr != "")
                {
                    // Every record ends with an int32 pos and a double frequency.
                    const int trailerSize = sizeof(int) + sizeof(double);
                    byte[] lengthBuf = new byte[sizeof(int)];

                    while (fs.Position < fs.Length)
                    {
                        if (ReadBlock(fs, lengthBuf) != lengthBuf.Length)
                        {
                            throw new EndOfStreamException("Dictionary file is truncated inside a record length.");
                        }

                        int length = BitConverter.ToInt32(lengthBuf, 0);

                        if (length < trailerSize)
                        {
                            throw new InvalidDataException("Dictionary file contains an invalid record length.");
                        }

                        // Check against the remaining bytes before allocating,
                        // so a corrupt length cannot trigger a huge allocation.
                        if (length > fs.Length - fs.Position)
                        {
                            throw new EndOfStreamException("Dictionary file is truncated inside a record.");
                        }

                        byte[] buf = new byte[length];

                        if (ReadBlock(fs, buf) != buf.Length)
                        {
                            throw new EndOfStreamException("Dictionary file is truncated inside a record.");
                        }

                        T_DictStruct dict = new T_DictStruct();
                        dict.Word = Encoding.UTF8.GetString(buf, 0, length - trailerSize);
                        dict.Pos = BitConverter.ToInt32(buf, length - trailerSize);
                        dict.Frequency = BitConverter.ToDouble(buf, length - sizeof(double));
                        dictFile.Dicts.Add(dict);
                    }

                    return dictFile;
                }
            }

            // No signature: file predates version 1.3. The stream above is
            // already closed here, so the legacy loader can reopen the file.
            return LoadFromBinFile(fileName);
        }

        // Fills buffer from stream, looping because a single Read call may
        // return fewer bytes than requested; returns the number of bytes read.
        private static int ReadBlock(Stream stream, byte[] buffer)
        {
            int total = 0;

            while (total < buffer.Length)
            {
                int read = stream.Read(buffer, total, buffer.Length - total);

                if (read <= 0)
                {
                    break;
                }

                total += read;
            }

            return total;
        }

    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -