⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 document.cs

📁 广义Jaccard系数(Tanimoto系数):是对Jaccard系数的扩展
💻 CS
字号:
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
namespace ClusterUsingKmeans
{
     /// <summary>
     /// A text document represented as a set of weighted words, with
     /// similarity measures (cosine and generalized Jaccard / Tanimoto)
     /// against another <see cref="Document"/>.
     /// </summary>
     public  class Document
     {
         /// <summary>
         /// Minimum fraction of each document's words that must be shared
         /// for a similarity to be reported; at or below it the result is 0.
         /// </summary>
         private const double MinimumSharedRatio = 0.1;

         /// <summary>
         /// Matches any token containing at least one character in the range
         /// U+4E00..U+9FA5. Built once instead of once per token.
         /// </summary>
         private static readonly Regex ChineseCharRegex = new Regex("[\u4e00-\u9fa5]");

         private readonly LoadDict LD = LoadDict.getDict();

         // Initialized eagerly so that instances created through the
         // parameterless constructor are usable (previously the field stayed
         // null and every member threw NullReferenceException).
         private readonly Dictionary<string, Word> MyWord = new Dictionary<string, Word>();

         private string _fileName;
         private int _length;

         /// <summary>
         /// Creates an empty document with no words.
         /// </summary>
         public Document()
         { }

         /// <summary>
         /// Builds a document from a file: reads the text, segments it into
         /// tokens, counts the frequency of each token that contains a
         /// character in U+4E00..U+9FA5, and assigns each word found in
         /// <c>LD.sogou</c> the weight
         /// <c>(frequency / text length) * |ln(D / Dw)|</c>, where <c>Dw</c>
         /// is the number parsed from the word's <c>LD.sogou</c> entry.
         /// Words without a usable dictionary entry are removed.
         /// </summary>
         /// <param name="fileName">Path passed to <c>IOControl.ReadFileUsingDefault</c>.</param>
         public Document(string fileName)
         {
             // 0x5f5e100 == 100,000,000.
             // NOTE(review): presumably the size of the reference corpus
             // behind LD.sogou - confirm against the dictionary source.
             const long D = 0x5f5e100;

             this._fileName = fileName;
             string text = IOControl.ReadFileUsingDefault(fileName);
             _length = text.Length;
             string[] tokens = WordSegment.SegmentWord(text);

             // Count word frequencies.
             foreach (string token in tokens)
             {
                 Word known;
                 if (MyWord.TryGetValue(token, out known))
                 {
                     known.WordFrequency += 1;
                 }
                 else if (ChineseCharRegex.IsMatch(token))
                 {
                     Word created = new Word(token);
                     created.WordFrequency = 1;
                     addWord(created);
                 }
             }

             // Compute the feature value (weight) of every word. Removal is
             // deferred because the dictionary cannot be modified while it
             // is being enumerated.
             List<string> discarded = new List<string>();
             foreach (KeyValuePair<string, Word> entry in MyWord)
             {
                 if (!LD.sogou.ContainsKey(entry.Key))
                 {
                     discarded.Add(entry.Key);
                     continue;
                 }

                 long dw = int.Parse(LD.sogou[entry.Key].ToString());
                 if (dw <= 0)
                 {
                     // D / 0 is Infinity and the log of a negative ratio is
                     // NaN; either would poison every later similarity, so
                     // such a word is treated like one missing from the
                     // dictionary.
                     discarded.Add(entry.Key);
                     continue;
                 }

                 double relativeFrequency = ((double)(entry.Value.WordFrequency)) / ((double)_length);
                 double logRatio = Math.Abs(Math.Log(((double)D) / ((double)dw)));
                 entry.Value.CharacterValue = relativeFrequency * logRatio;
             }

             // Drop the words that could not be weighted.
             foreach (string key in discarded)
             {
                 deleteWord(key);
             }
         }

         /// <summary>
         /// Cosine-style similarity between this document and
         /// <paramref name="Doc"/>, computed over the words both contain.
         /// </summary>
         /// <remarks>
         /// NOTE(review): both vector norms are accumulated over the shared
         /// words only, not over each document's full vocabulary. That is
         /// the original behavior and is kept as is - confirm it is intended.
         /// </remarks>
         /// <param name="Doc">The document to compare with.</param>
         /// <returns>
         /// 0 when either document has no words, when the shared words make
         /// up 10% or less of either document's words, or when the norm
         /// product is zero; otherwise dot / (|a| * |b|).
         /// </returns>
         /// <exception cref="ArgumentNullException"><paramref name="Doc"/> is null.</exception>
         public double SimilitudeValueToDocumentUsingCos(Document Doc)
         {
             double dot;
             double ownSquares;
             double otherSquares;
             if (!TryCollectSharedStatistics(Doc, out dot, out ownSquares, out otherSquares))
             {
                 return 0;
             }

             double denominator = Math.Sqrt(ownSquares) * Math.Sqrt(otherSquares);
             if (denominator <= 0 || double.IsNaN(denominator))
             {
                 // All shared weights are zero on one side: avoid 0/0 = NaN.
                 return 0;
             }
             return dot / denominator;
         }

         /// <summary>
         /// Generalized Jaccard (Tanimoto) similarity between this document
         /// and <paramref name="Doc"/>, computed over the words both contain:
         /// dot / (|a|^2 + |b|^2 - dot).
         /// </summary>
         /// <remarks>
         /// NOTE(review): the squared norms are accumulated over the shared
         /// words only, as in the original implementation - confirm it is
         /// intended.
         /// </remarks>
         /// <param name="Doc">The document to compare with.</param>
         /// <returns>
         /// 0 when either document has no words, when the shared words make
         /// up 10% or less of either document's words, or when the
         /// denominator is zero; otherwise the coefficient.
         /// </returns>
         /// <exception cref="ArgumentNullException"><paramref name="Doc"/> is null.</exception>
         public double SimilitudeValueToDocumentUsingGeneralizedJaccardCoefficient(Document Doc)
         {
             double dot;
             double ownSquares;
             double otherSquares;
             if (!TryCollectSharedStatistics(Doc, out dot, out ownSquares, out otherSquares))
             {
                 return 0;
             }

             double denominator = ownSquares + otherSquares - dot;
             if (denominator <= 0 || double.IsNaN(denominator))
             {
                 // All shared weights are zero: avoid 0/0 = NaN.
                 return 0;
             }
             return dot / denominator;
         }

         /// <summary>
         /// Accumulates, over the words present in both documents, the dot
         /// product and the two sums of squared weights.
         /// </summary>
         /// <returns>
         /// false when either document is empty or the shared words do not
         /// exceed <see cref="MinimumSharedRatio"/> of both documents' word
         /// counts; true otherwise.
         /// </returns>
         private bool TryCollectSharedStatistics(Document other, out double dot, out double ownSquares, out double otherSquares)
         {
             if (other == null)
             {
                 throw new ArgumentNullException("Doc");
             }

             dot = 0;
             ownSquares = 0;
             otherSquares = 0;

             // Guard the ratio divisions below against 0/0.
             if (MyWord.Count == 0 || other.MyWord.Count == 0)
             {
                 return false;
             }

             int shared = 0;
             foreach (KeyValuePair<string, Word> entry in MyWord)
             {
                 Word counterpart;
                 if (!other.MyWord.TryGetValue(entry.Key, out counterpart))
                 {
                     continue;
                 }

                 double own = (double)entry.Value.CharacterValue;
                 double theirs = (double)counterpart.CharacterValue;
                 dot += own * theirs;
                 ownSquares += own * own;
                 otherSquares += theirs * theirs;
                 shared++;
             }

             return (((double)shared) / ((double)MyWord.Count)) > MinimumSharedRatio
                 && (((double)shared) / ((double)other.MyWord.Count)) > MinimumSharedRatio;
         }

         /// <summary>
         /// File name this document was built from (as passed to the constructor).
         /// </summary>
         public string FileName
         {
             get { return _fileName; }
             set { _fileName = value; }
         }

         /// <summary>
         /// Length, in characters, of the text read from the file.
         /// </summary>
         public int Length
         {
             get { return _length; }
             set { _length = value; }
         }

         /// <summary>
         /// Removes the word stored under <paramref name="key"/>.
         /// </summary>
         /// <returns>true if a word was removed; false if the key was absent.</returns>
         public bool deleteWord(string key)
         {
             return MyWord.Remove(key);
         }

         /// <summary>
         /// Adds a new <see cref="Word"/> for <paramref name="key"/> unless
         /// one is already stored under that key.
         /// </summary>
         public void addWord(string key)
         {
             if (!MyWord.ContainsKey(key))
             {
                 MyWord.Add(key, new Word(key));
             }
         }

         /// <summary>
         /// Adds <paramref name="word"/> under its <c>Key</c> unless a word
         /// is already stored under that key.
         /// </summary>
         /// <exception cref="ArgumentNullException"><paramref name="word"/> is null.</exception>
         public void addWord(Word word)
         {
             if (word == null)
             {
                 throw new ArgumentNullException("word");
             }
             if (!MyWord.ContainsKey(word.Key))
             {
                 MyWord.Add(word.Key, word);
             }
         }

         /// <summary>
         /// Looks up the word stored under <paramref name="key"/>.
         /// </summary>
         /// <returns>The stored word, or null when the key is absent.</returns>
         public Word getWordByKey(string key)
         {
             Word found;
             return MyWord.TryGetValue(key, out found) ? found : null;
         }

         /// <summary>
         /// Returns the live key collection of the words in this document.
         /// </summary>
         public Dictionary<string, Word>.KeyCollection getAllWordKeys()
         {
             return MyWord.Keys;
         }
     }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -