⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 program.cs

📁 应用FCM(模糊c均值聚类)算法到文本聚类 采用两种方法计算文本相似度 采用ShootSeg分词 采用sogou互联网词库简化特征值计算
💻 CS
📖 第 1 页 / 共 2 页
字号:
using System;
using System.Collections.Generic;
using System.Windows.Forms;
using System.Drawing;
using System.Collections;
using System.Drawing.Drawing2D;
using System.ComponentModel;
using System.Data;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;
using ShootSeg;

namespace textFCM
{
    /// <summary>
    /// Holds the process entry point of the textFCM Windows Forms application.
    /// </summary>
    static class Program
    {
        /// <summary>
        /// Main entry point of the application: applies the visual-style and
        /// text-rendering settings, then runs the message loop on a new
        /// <c>Form1</c>.
        /// </summary>
        [STAThread]
        static void Main()
        {
            // Both application-wide settings are applied before the form is constructed.
            Application.EnableVisualStyles();
            Application.SetCompatibleTextRenderingDefault(false);

            Form1 mainForm = new Form1();
            Application.Run(mainForm);
        }
    }
    /// <summary>
    /// A text document reduced to a set of weighted words. When built from an
    /// array of file names it additionally keeps the per-file documents in
    /// <see cref="Documents"/> and their word tables in <see cref="Words"/>.
    /// Also provides two similarity measures between documents.
    /// </summary>
    public class Document
    {
        /// <summary>
        /// Matches any token containing at least one character in the range
        /// U+4E00..U+9FA5 (Chinese). Built once instead of once per token.
        /// </summary>
        private static readonly Regex ChineseCharacter = new Regex("[\u4e00-\u9fa5]");

        /// <summary>
        /// Numerator of the logarithmic weight, 0x5f5e100 == 100,000,000.
        /// NOTE(review): presumably the corpus size behind the Sogou
        /// dictionary counts - confirm against the dictionary's documentation.
        /// </summary>
        private const long FrequencyBase = 0x5f5e100;

        // Obtained for every Document instance, whichever constructor is used.
        LoadDict LD = LoadDict.getDict();

        /// <summary>Words of this document, keyed by the word text.</summary>
        public Dictionary<string, Word> MyWord = new Dictionary<string, Word>();

        /// <summary>
        /// Only read (never filled) inside this class; see the note in
        /// <see cref="Document(string[])"/>.
        /// </summary>
        public Dictionary<string, Word> MyWords = new Dictionary<string, Word>();

        /// <summary>Word table of every per-file document loaded by <see cref="Document(string[])"/>.</summary>
        public Dictionary<Document, Dictionary<string, Word>> Words = new Dictionary<Document, Dictionary<string, Word>>();

        /// <summary>Per-file documents loaded by <see cref="Document(string[])"/>, in input order.</summary>
        public ArrayList Documents = new ArrayList();

        /// <summary>Creates an empty document.</summary>
        public Document()
        { }

        /// <summary>
        /// Loads one document per file name and collects the distinct words of
        /// all of them into this instance's <see cref="MyWord"/>.
        /// </summary>
        /// <param name="fileNames">Paths of the text files to load.</param>
        public Document(string[] fileNames)
        {
            foreach (string fileName in fileNames)
            {
                Document doc = new Document(fileName);
                foreach (string key in doc.MyWord.Keys)
                {
                    // NOTE(review): this tests MyWords, but addWord fills MyWord and
                    // nothing in this class ever adds to MyWords. addWord itself skips
                    // duplicates, so the result is still one entry per distinct word.
                    // Confirm which dictionary callers expect before changing it.
                    if (!MyWords.ContainsKey(key))
                    {
                        addWord(new Word(key));
                    }
                }
                Documents.Add(doc);
                Words.Add(doc, doc.MyWord);
            }
        }

        /// <summary>
        /// Reads a file, segments it into words, counts the words that contain
        /// Chinese characters, weights those found in the Sogou dictionary and
        /// drops the rest.
        /// </summary>
        /// <param name="fileName">Path of the text file to load.</param>
        public Document(string fileName)
        {
            this._fileName = fileName;
            string strFile = IOControl.ReadFileUsingDefault(fileName);
            _length = strFile.Length;

            Segment seg = new Segment();
            seg.InitWordDics();
            seg.Separator = "/";
            string[] tokens = seg.SegmentText(strFile, true).Split(new char[] { '/' });

            CountWordFrequencies(tokens);
            WeighWordsAndDropUnknown();
        }

        /// <summary>Counts how often each token containing a Chinese character occurs.</summary>
        private void CountWordFrequencies(string[] tokens)
        {
            foreach (string token in tokens)
            {
                Word known;
                if (MyWord.TryGetValue(token, out known))
                {
                    known.WordFrequency += 1;
                }
                else if (ChineseCharacter.IsMatch(token))
                {
                    Word fresh = new Word(token);
                    fresh.WordFrequency = 1;
                    addWord(fresh);
                }
            }
        }

        /// <summary>
        /// Assigns each word its feature value
        /// (frequency / text length) * |ln(FrequencyBase / dictionary count)|
        /// and removes every word that cannot be weighted.
        /// </summary>
        private void WeighWordsAndDropUnknown()
        {
            // Removal is deferred: MyWord must not change while it is being enumerated.
            List<string> unusableWords = new List<string>();
            foreach (KeyValuePair<string, Word> entry in MyWord)
            {
                string rawCount;
                if (!LD.sogou.TryGetValue(entry.Key, out rawCount))
                {
                    unusableWords.Add(entry.Key);
                    continue;
                }

                long dictionaryCount = long.Parse(rawCount);
                if (dictionaryCount <= 0)
                {
                    // A non-positive count would make the logarithm infinite or NaN.
                    unusableWords.Add(entry.Key);
                    continue;
                }

                double termFrequency = ((double)entry.Value.WordFrequency) / ((double)_length);
                double logWeight = Math.Abs(Math.Log(((double)FrequencyBase) / ((double)dictionaryCount)));
                entry.Value.CharacterValue = termFrequency * logWeight;
            }

            foreach (string key in unusableWords)
            {
                deleteWord(key);
            }
        }

        /// <summary>
        /// Accumulates, over the words present in both documents, the dot
        /// product and the two sums of squared feature values.
        /// </summary>
        /// <returns>
        /// <c>true</c> when the shared words make up more than 10% of the words
        /// of each document; <c>false</c> otherwise (including when there are
        /// no shared words at all).
        /// </returns>
        private static bool TryAccumulateSharedTerms(Document Doc1, Document Doc2,
            out double dot, out double squares1, out double squares2)
        {
            dot = 0;
            squares1 = 0;
            squares2 = 0;
            double shared = 0;

            foreach (KeyValuePair<string, Word> entry in Doc1.MyWord)
            {
                Word other;
                if (!Doc2.MyWord.TryGetValue(entry.Key, out other))
                {
                    continue;
                }
                double value1 = (double)entry.Value.CharacterValue;
                double value2 = (double)other.CharacterValue;
                dot += value1 * value2;
                squares1 += value1 * value1;
                squares2 += value2 * value2;
                shared += 1;
            }

            if (shared == 0)
            {
                // Also covers empty documents, where the ratios below would be 0/0.
                return false;
            }
            return (shared / ((double)Doc1.MyWord.Count)) > 0.1
                && (shared / ((double)Doc2.MyWord.Count)) > 0.1;
        }

        /// <summary>
        /// Cosine-style similarity of two documents. Note that the dot product
        /// and both norms are accumulated over the shared words only.
        /// </summary>
        /// <param name="Doc1">First document.</param>
        /// <param name="Doc2">Second document.</param>
        /// <returns>
        /// 0 when the word overlap is 10% or less of either document or when
        /// the denominator is zero; otherwise dot / (norm1 * norm2).
        /// </returns>
        public double SimilitudeValueToDocumentUsingCos(Document Doc1, Document Doc2)
        {
            double dot, squares1, squares2;
            if (!TryAccumulateSharedTerms(Doc1, Doc2, out dot, out squares1, out squares2))
            {
                return 0;
            }

            double denominator = Math.Sqrt(squares1) * Math.Sqrt(squares2);
            if (denominator == 0)
            {
                // All shared feature values are zero: report "no similarity" instead of NaN.
                return 0;
            }
            return dot / denominator;
        }

        /// <summary>Returns 1 minus <see cref="SimilitudeValueToDocumentUsingCos"/>.</summary>
        public double UnSimilitudeValueToDocumentUsingCos(Document Doc1, Document Doc2)
        {
            return (1 - SimilitudeValueToDocumentUsingCos(Doc1, Doc2));
        }

        /// <summary>
        /// Generalized Jaccard coefficient of two documents, accumulated over
        /// the shared words only.
        /// </summary>
        /// <param name="Doc1">First document.</param>
        /// <param name="Doc2">Second document.</param>
        /// <returns>
        /// 0 when the word overlap is 10% or less of either document or when
        /// the denominator is zero; otherwise dot / (squares1 + squares2 - dot).
        /// </returns>
        public double SimilitudeValueToDocumentUsingGeneralizedJaccardCoefficient(Document Doc1, Document Doc2)
        {
            double dot, squares1, squares2;
            if (!TryAccumulateSharedTerms(Doc1, Doc2, out dot, out squares1, out squares2))
            {
                return 0;
            }

            double denominator = squares1 + squares2 - dot;
            if (denominator == 0)
            {
                // Avoid returning NaN or Infinity to the caller.
                return 0;
            }
            return dot / denominator;
        }

        /// <summary>Returns 1 minus <see cref="SimilitudeValueToDocumentUsingGeneralizedJaccardCoefficient"/>.</summary>
        public double UnSimilitudeValueToDocumentUsingGeneralizedJaccardCoefficient(Document Doc1, Document Doc2)
        {
            return (1 - SimilitudeValueToDocumentUsingGeneralizedJaccardCoefficient(Doc1, Doc2));
        }

        private string _fileName;

        /// <summary>Path the document was loaded from (set by the single-file constructor).</summary>
        public string FileName
        {
            get { return _fileName; }
            set { _fileName = value; }
        }

        private int _length;

        /// <summary>Number of characters in the text read by the single-file constructor.</summary>
        public int Length
        {
            get { return _length; }
            set { _length = value; }
        }

        /// <summary>Removes a word from <see cref="MyWord"/>.</summary>
        /// <returns><c>true</c> if the word was present and has been removed.</returns>
        public bool deleteWord(string key)
        {
            return MyWord.Remove(key);
        }

        /// <summary>Adds a new <c>Word</c> for <paramref name="key"/> unless the key already exists.</summary>
        public void addWord(string key)
        {
            if (!MyWord.ContainsKey(key))
            {
                MyWord.Add(key, new Word(key));
            }
        }

        /// <summary>Adds <paramref name="word"/> under its own key unless that key already exists.</summary>
        public void addWord(Word word)
        {
            if (!MyWord.ContainsKey(word.Key))
            {
                MyWord.Add(word.Key, word);
            }
        }

        /// <summary>Looks up a word by key.</summary>
        /// <returns>The stored word, or <c>null</c> when the key is unknown.</returns>
        public Word getWordByKey(string key)
        {
            Word found;
            return MyWord.TryGetValue(key, out found) ? found : null;
        }

        /// <summary>Returns the live key collection of <see cref="MyWord"/>.</summary>
        public Dictionary<string, Word>.KeyCollection getAllWordKeys()
        {
            return MyWord.Keys;
        }
    }

    /// <summary>
    /// File reading helper.
    /// </summary>
    class IOControl
    {
        /// <summary>
        /// Reads the whole file as text using <see cref="Encoding.Default"/>.
        /// </summary>
        /// <param name="FileName">Path of the file to read.</param>
        /// <returns>The complete content of the file.</returns>
        public static string ReadFileUsingDefault(string FileName)
        {
            // The using statement releases the file handle even if ReadToEnd throws.
            using (StreamReader sr = new StreamReader(FileName, Encoding.Default))
            {
                return sr.ReadToEnd();
            }
        }
    }

    public class LoadDict
    {
        // Word -> second tab-separated column of the dictionary file.
        private Dictionary<string, string> _sogou = new Dictionary<string, string>();
        // Word -> third tab-separated column of the dictionary file.
        private Dictionary<string, string> _sogou1 = new Dictionary<string, string>();
        private static bool instance_flag = false;

        /// <summary>
        /// Loads the word dictionary from <c>data\SogouLabDic.dic</c> under the
        /// current directory, read with <see cref="Encoding.Default"/>.
        /// Each line is split on tabs: field 0 is the word, field 1 is stored
        /// in <c>sogou</c> and field 2 in <c>sogou1</c>.
        /// </summary>
        /// <remarks>
        /// Lines with fewer than three fields (for example a trailing blank
        /// line) are skipped. A repeated word still raises the
        /// <see cref="ArgumentException"/> thrown by <c>Dictionary.Add</c>.
        /// </remarks>
        private LoadDict()
        {
            string path = Environment.CurrentDirectory+"\\data\\SogouLabDic.dic";
            char[] separator = new char[] { '\t' };

            // The using statement closes the file even when a line fails to load.
            using (StreamReader reader = new StreamReader(path, Encoding.Default))
            {
                string text;
                while ((text = reader.ReadLine()) != null)
                {
                    string[] textArray = text.Split(separator);
                    if (textArray.Length < 3)
                    {
                        continue;
                    }
                    this.sogou.Add(textArray[0], textArray[1]);
                    this.sogou1.Add(textArray[0], textArray[2]);
                }
            }
        }
        /// <summary>Dictionary instance created by the first successful <see cref="getDict"/> call.</summary>
        private static LoadDict _instance;

        /// <summary>
        /// Returns the shared dictionary, loading it from disk on the first call
        /// and reusing that instance afterwards.
        /// </summary>
        /// <remarks>
        /// Previously every call constructed a new instance (and so re-read the
        /// dictionary file), because <c>instance_flag</c> was never set. The
        /// cache is not synchronized; if the constructor throws, nothing is
        /// cached and the next call tries again.
        /// </remarks>
        /// <returns>The cached <see cref="LoadDict"/>; never <c>null</c>.</returns>
        public static LoadDict getDict()
        {
            if (_instance == null)
            {
                _instance = new LoadDict();
                instance_flag = true;
            }
            return _instance;
        }

        /// <summary>
        /// Gets or sets the table that maps a word to the second tab-separated
        /// field of its dictionary line.
        /// </summary>
        public Dictionary<string, string> sogou
        {
            get
            {
                return this._sogou;
            }
            set
            {
                // Store the supplied table; the setter used to assign the
                // property to itself and silently ignored the new value.
                this._sogou = value;
            }
        }

        public Dictionary<string, string> sogou1
        {
            get

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -