📄 program.cs
字号:
using System;
using System.Collections.Generic;
using System.Windows.Forms;
using System.Drawing;
using System.Collections;
using System.Drawing.Drawing2D;
using System.ComponentModel;
using System.Data;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;
using ShootSeg;
namespace textFCM
{
internal static class Program
{
    /// <summary>
    /// Main entry point of the application: configures WinForms rendering and
    /// runs the message loop on the main form.
    /// </summary>
    [STAThread]
    private static void Main()
    {
        // Rendering options are set up before the first form is constructed.
        Application.EnableVisualStyles();
        Application.SetCompatibleTextRenderingDefault(false);

        Form1 mainForm = new Form1();
        Application.Run(mainForm);
    }
}
public class Document
{
// Dictionary data obtained from LoadDict.getDict(); Document(string) looks words up in LD.sogou.
LoadDict LD = LoadDict.getDict();
// Words of this document keyed by Word.Key; filled through addWord and pruned through deleteWord.
public Dictionary<string, Word> MyWord = new Dictionary<string, Word>();
// Not populated anywhere inside this class (only queried in Document(string[])); it is public, so other code may fill it.
public Dictionary<string, Word> MyWords = new Dictionary<string, Word>();
// Filled by Document(string[]): maps each per-file Document to that document's MyWord dictionary.
public Dictionary<Document, Dictionary<string, Word>> Words = new Dictionary<Document, Dictionary<string, Word>>();
// Filled by Document(string[]): the per-file Document objects, in the order the file names were given.
public ArrayList Documents = new ArrayList();
/// <summary>
/// Creates an empty document: no file is read and no words are registered.
/// </summary>
public Document()
{
}
/// <summary>
/// Builds a multi-file document: every file is loaded into its own
/// <see cref="Document"/>, each of its words is registered in this instance's
/// <c>MyWord</c> (as a fresh <c>Word</c>), and the per-file document is
/// recorded in <c>Documents</c> and <c>Words</c>.
/// </summary>
/// <param name="fileNames">Paths of the files to load, processed in order.</param>
public Document(string[] fileNames)
{
    foreach (string path in fileNames)
    {
        Document perFile = new Document(path);

        foreach (string term in perFile.MyWord.Keys)
        {
            // NOTE(review): MyWords is never filled inside this class, so this
            // check does not filter anything here; duplicates are rejected by
            // addWord, which tests MyWord instead. Confirm which was intended.
            if (MyWords.ContainsKey(term))
            {
                continue;
            }

            addWord(new Word(term));
        }

        Documents.Add(perFile);
        Words.Add(perFile, perFile.MyWord);
    }
}
/// <summary>
/// Loads one text file, segments it into words, counts the occurrences of every
/// word that contains a Chinese character, and gives each word found in the
/// Sogou dictionary a feature value. Words missing from the dictionary are removed.
/// </summary>
/// <param name="fileName">Path of the text file to read.</param>
public Document(string fileName)
{
    // 0x5f5e100 == 100,000,000. Presumably the corpus size the Sogou counts are
    // relative to -- TODO confirm against the dictionary's documentation.
    const long D = 0x5f5e100;

    this._fileName = fileName;
    string strFile = IOControl.ReadFileUsingDefault(fileName);
    _length = strFile.Length;

    // Segment the text; the segmenter joins words with "/", which is split on below.
    Segment seg = new Segment();
    seg.InitWordDics();
    seg.Separator = "/";
    string[] tempWord = seg.SegmentText(strFile, true).Split(new char[] { '/' });

    // Built once for the whole loop (it used to be rebuilt for every new token).
    Regex regex = new Regex("[\u4e00-\u9fa5]"); // matches a Chinese character

    // Count word frequencies.
    foreach (string str in tempWord)
    {
        Word known;
        if (MyWord.TryGetValue(str, out known))
        {
            known.WordFrequency += 1;
        }
        else if (regex.IsMatch(str))
        {
            // First occurrence: only words containing Chinese characters are tracked.
            Word W = new Word(str);
            W.WordFrequency = 1;
            addWord(W);
        }
    }

    // Compute each word's feature value:
    // (frequency / text length) * |ln(D / dictionary count)|.
    // Only the Word objects are modified here, never the dictionary itself,
    // so enumerating MyWord stays valid.
    List<string> NoNeedWord = new List<string>();
    foreach (KeyValuePair<string, Word> entry in MyWord)
    {
        string rawCount;
        if (LD.sogou.TryGetValue(entry.Key, out rawCount))
        {
            double num = ((double)(entry.Value.WordFrequency)) / ((double)_length);
            // Parsed as long so counts above int.MaxValue no longer overflow.
            long Dw = long.Parse(rawCount);
            double num2 = Math.Abs(Math.Log(((double)D) / ((double)Dw)));
            entry.Value.CharacterValue = num * num2;
        }
        else
        {
            // Removal is deferred: the dictionary cannot change while it is enumerated.
            NoNeedWord.Add(entry.Key);
        }
    }

    // Drop the words that are not in the Sogou dictionary.
    foreach (string str in NoNeedWord)
    {
        deleteWord(str);
    }
}
/// <summary>
/// Cosine similarity of two documents, computed from the feature values of the
/// words they have in common (the dot product and both norms use shared words only).
/// </summary>
/// <param name="Doc1">First document.</param>
/// <param name="Doc2">Second document.</param>
/// <returns>
/// 0 when there are no shared words, when the shared words make up at most 10%
/// of either document's words, or when a norm is 0 (the quotient would be NaN);
/// otherwise dot / (|d1| * |d2|).
/// </returns>
public double SimilitudeValueToDocumentUsingCos(Document Doc1, Document Doc2)
{
    double dot = 0;        // sum of v1 * v2 over shared words
    double sumSq1 = 0;     // sum of v1^2 over shared words
    double sumSq2 = 0;     // sum of v2^2 over shared words
    double shared = 0;     // number of shared words

    foreach (KeyValuePair<string, Word> entry in Doc1.MyWord)
    {
        Word other;
        if (Doc2.MyWord.TryGetValue(entry.Key, out other))
        {
            double v1 = (double)entry.Value.CharacterValue;
            double v2 = (double)other.CharacterValue;
            dot += v1 * v2;
            sumSq1 += v1 * v1;
            sumSq2 += v2 * v2;
            shared += 1;
        }
    }

    // No overlap at all. This also covers two empty documents, for which the
    // ratio tests below would evaluate 0/0 and let NaN through.
    if (shared == 0)
    {
        return 0;
    }

    if (((shared / ((double)Doc1.MyWord.Count)) <= 0.1) || ((shared / ((double)Doc2.MyWord.Count)) <= 0.1))
    {
        return 0;
    }

    double denominator = Math.Sqrt(sumSq1) * Math.Sqrt(sumSq2);
    if (denominator == 0)
    {
        // All shared feature values of one document are 0: avoid returning 0/0 = NaN.
        return 0;
    }

    return dot / denominator;
}
/// <summary>
/// Cosine dissimilarity: one minus <c>SimilitudeValueToDocumentUsingCos</c>.
/// </summary>
/// <param name="Doc1">First document.</param>
/// <param name="Doc2">Second document.</param>
/// <returns>1 - similarity.</returns>
public double UnSimilitudeValueToDocumentUsingCos(Document Doc1, Document Doc2)
{
    double similarity = SimilitudeValueToDocumentUsingCos(Doc1, Doc2);
    return 1 - similarity;
}
/// <summary>
/// Generalized Jaccard similarity of two documents, computed from the feature
/// values of the words they have in common: dot / (sum v1^2 + sum v2^2 - dot).
/// </summary>
/// <param name="Doc1">First document.</param>
/// <param name="Doc2">Second document.</param>
/// <returns>
/// 0 when there are no shared words, when the shared words make up at most 10%
/// of either document's words, or when the denominator is 0 (the quotient would
/// be NaN); otherwise the coefficient described above.
/// </returns>
public double SimilitudeValueToDocumentUsingGeneralizedJaccardCoefficient(Document Doc1, Document Doc2)
{
    double dot = 0;        // sum of v1 * v2 over shared words
    double sumSq1 = 0;     // sum of v1^2 over shared words
    double sumSq2 = 0;     // sum of v2^2 over shared words
    double shared = 0;     // number of shared words

    foreach (KeyValuePair<string, Word> entry in Doc1.MyWord)
    {
        Word other;
        if (Doc2.MyWord.TryGetValue(entry.Key, out other))
        {
            double v1 = (double)entry.Value.CharacterValue;
            double v2 = (double)other.CharacterValue;
            dot += v1 * v2;
            sumSq1 += v1 * v1;
            sumSq2 += v2 * v2;
            shared += 1;
        }
    }

    // No overlap at all. This also covers two empty documents, for which the
    // ratio tests below would evaluate 0/0 and let NaN through.
    if (shared == 0)
    {
        return 0;
    }

    if (((shared / ((double)Doc1.MyWord.Count)) <= 0.1) || ((shared / ((double)Doc2.MyWord.Count)) <= 0.1))
    {
        return 0;
    }

    double denominator = sumSq1 + sumSq2 - dot;
    if (denominator == 0)
    {
        // Happens when every shared feature value is 0: avoid returning 0/0 = NaN.
        return 0;
    }

    return dot / denominator;
}
/// <summary>
/// Generalized-Jaccard dissimilarity: one minus
/// <c>SimilitudeValueToDocumentUsingGeneralizedJaccardCoefficient</c>.
/// </summary>
/// <param name="Doc1">First document.</param>
/// <param name="Doc2">Second document.</param>
/// <returns>1 - similarity.</returns>
public double UnSimilitudeValueToDocumentUsingGeneralizedJaccardCoefficient(Document Doc1, Document Doc2)
{
    double similarity = SimilitudeValueToDocumentUsingGeneralizedJaccardCoefficient(Doc1, Doc2);
    return 1 - similarity;
}
// Backing field of FileName; assigned by the Document(string) constructor.
private string _fileName;

/// <summary>
/// Path of the file this document was loaded from.
/// </summary>
public string FileName
{
    get
    {
        return this._fileName;
    }
    set
    {
        this._fileName = value;
    }
}
// Backing field of Length; the Document(string) constructor sets it to the
// character count of the text that was read.
private int _length;

/// <summary>
/// Length, in characters, of the document's text.
/// </summary>
public int Length
{
    get
    {
        return this._length;
    }
    set
    {
        this._length = value;
    }
}
/// <summary>
/// Removes a word from <c>MyWord</c>.
/// </summary>
/// <param name="key">Key of the word to remove.</param>
/// <returns>true if the word was present and has been removed; otherwise false.</returns>
public bool deleteWord(string key)
{
    bool removed = MyWord.Remove(key);
    return removed;
}
/// <summary>
/// Registers a new <c>Word</c> for <paramref name="key"/> in <c>MyWord</c>;
/// does nothing when the key is already present.
/// </summary>
/// <param name="key">Text of the word to register.</param>
public void addWord(string key)
{
    if (!MyWord.ContainsKey(key))
    {
        MyWord.Add(key, new Word(key));
    }
}
/// <summary>
/// Registers <paramref name="word"/> in <c>MyWord</c> under its <c>Key</c>;
/// an already-registered key keeps its existing entry.
/// </summary>
/// <param name="word">Word to register.</param>
public void addWord(Word word)
{
    if (!MyWord.ContainsKey(word.Key))
    {
        MyWord.Add(word.Key, word);
    }
}
/// <summary>
/// Looks up a word of this document by its key.
/// </summary>
/// <param name="key">Key of the word to find.</param>
/// <returns>The stored <c>Word</c>, or null when the key is not present.</returns>
public Word getWordByKey(string key)
{
    Word found;
    return MyWord.TryGetValue(key, out found) ? found : null;
}
/// <summary>
/// Returns the key collection of <c>MyWord</c>.
/// </summary>
/// <returns>The keys of every word currently registered in this document.</returns>
public Dictionary<string, Word>.KeyCollection getAllWordKeys()
{
    Dictionary<string, Word>.KeyCollection keys = MyWord.Keys;
    return keys;
}
}
/// <summary>
/// File-reading helper.
/// </summary>
class IOControl
{
    /// <summary>
    /// Reads the whole file as text using <see cref="Encoding.Default"/>.
    /// </summary>
    /// <param name="FileName">Path of the file to read.</param>
    /// <returns>The complete content of the file.</returns>
    public static string ReadFileUsingDefault(string FileName)
    {
        // "using" releases the reader (and its file handle) even when
        // ReadToEnd throws; the previous explicit Close() was skipped in that case.
        using (StreamReader sr = new StreamReader(FileName, Encoding.Default))
        {
            return sr.ReadToEnd();
        }
    }
}
public class LoadDict
{
// Backing store of the sogou property: first tab-separated column of each dictionary line -> second column.
private Dictionary<string, string> _sogou = new Dictionary<string, string>();
// Presumably the backing store of the sogou1 property (first column -> third column) -- confirm against the sogou1 accessor.
private Dictionary<string, string> _sogou1 = new Dictionary<string, string>();
// Flag read by getDict() when deciding what to return.
private static bool instance_flag = false;
/// <summary>
/// Loads the dictionary file <c>data\SogouLabDic.dic</c> located under the
/// current directory. Each line is tab-separated: column 0 is the key, column 1
/// is stored in <c>sogou</c> and column 2 in <c>sogou1</c>.
/// </summary>
private LoadDict()
{
    // Allocated once instead of once per line.
    char[] separator = new char[] { '\t' };

    // "using" closes the file when loading finishes or fails; the reader was
    // previously never closed.
    using (StreamReader reader = new StreamReader(Environment.CurrentDirectory+"\\data\\SogouLabDic.dic", Encoding.Default))
    {
        string text;
        while ((text = reader.ReadLine()) != null)
        {
            string[] textArray = text.Split(separator);
            if (textArray.Length < 3)
            {
                // Blank or incomplete line (e.g. a trailing empty line): skip it
                // rather than fail with IndexOutOfRangeException.
                continue;
            }

            this.sogou.Add(textArray[0], textArray[1]);
            this.sogou1.Add(textArray[0], textArray[2]);
        }
    }
}
// Single shared instance, created on the first getDict() call.
private static LoadDict _sharedInstance;

/// <summary>
/// Returns the shared dictionary instance, loading it on first use.
/// </summary>
/// <remarks>
/// Previously every call built a new <c>LoadDict</c>, re-reading the whole
/// dictionary file each time, and the method could return null once
/// <c>instance_flag</c> was set. The instance is now cached and always returned.
/// No locking is done; NOTE(review): add synchronization if this can be called
/// from several threads.
/// </remarks>
/// <returns>The shared <c>LoadDict</c>; never null.</returns>
public static LoadDict getDict()
{
    if (_sharedInstance == null)
    {
        // The flag is only raised after the constructor succeeded, so a failed
        // load is retried on the next call.
        _sharedInstance = new LoadDict();
        instance_flag = true;
    }

    return _sharedInstance;
}
/// <summary>
/// Dictionary mapping each word (first column of the dictionary file) to the
/// second column of its line.
/// </summary>
public Dictionary<string, string> sogou
{
    get
    {
        return this._sogou;
    }
    set
    {
        // Was "this._sogou = this.sogou", a self-assignment that ignored the
        // supplied value and made the setter a no-op.
        this._sogou = value;
    }
}
public Dictionary<string, string> sogou1
{
get
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -