📄 wordlib.cs
字号:
using System;
using System.Data;
using System.Data.SqlClient;
using System.IO;
namespace Njnu.DAL
{
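/// <summary>
/// Data-access helpers that fill the word-frequency table (WordLib), the TF-IDF
/// weight tables (TfidfLib, TfidfLib_test) and the per-category term tables
/// (TermLib_temp, TermLib).
/// </summary>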
public class WordLib
{
#region 将某个文档信息添加至表WordLib
/// <summary>
/// Segments the body of <paramref name="doc"/> into words, counts how often each
/// distinct word occurs in that document, and inserts one row per distinct word
/// into <paramref name="tableName"/> (columns word, cateID, docID, termFrequence).
/// </summary>
/// <param name="doc">Document whose Path, CateID and DocID are read.</param>
/// <param name="tableName">
/// Target table name. It is concatenated into the SQL text (identifiers cannot be
/// bound as parameters), so callers must pass a trusted name only.
/// </param>
/// <exception cref="Exception">
/// An insert failed. The message names the word being written and the original
/// exception is available through InnerException.
/// </exception>
public void AddWord(Common.Doc doc, string tableName)
{
    WordSegmentor.WordSegmentor segmentor = new WordSegmentor.WordSegmentor(false);
    string segmented = segmentor.GetString(new DAL.Doc().GetBody(doc.Path));
    string[] tokens = segmented.Split('/');

    // Term-frequency count for this single document.
    // distinctWords/termFrequencies keep first-seen order; slotByWord maps a word to
    // its slot so each token is handled in O(1) instead of rescanning the array.
    // There are never more distinct words than tokens, so the arrays cannot overflow.
    string[] distinctWords = new string[tokens.Length];
    int[] termFrequencies = new int[tokens.Length];
    System.Collections.Hashtable slotByWord = new System.Collections.Hashtable();
    int distinctCount = 0;
    foreach (string token in tokens)
    {
        // Split yields empty strings for leading, trailing or doubled separators;
        // an empty string is not a word, so it is never counted or stored.
        if (token.Length == 0)
        {
            continue;
        }

        object slot = slotByWord[token];
        if (slot == null)
        {
            slotByWord[token] = distinctCount;
            distinctWords[distinctCount] = token;
            termFrequencies[distinctCount] = 1;
            distinctCount++;
        }
        else
        {
            termFrequencies[(int)slot]++;
        }
    }

    // Values are bound as parameters so a word containing an apostrophe can neither
    // break the statement nor inject SQL.
    string insertSql = "insert " + tableName + "(word, cateID, docID, termFrequence) values(@word, @cateID, @docID, @termFrequence)";
    string currentWord = null; // kept outside the try block for the error message
    try
    {
        // Every distinct word is written, including the last one.
        for (int i = 0; i < distinctCount; i++)
        {
            currentWord = distinctWords[i];
            SqlParameter[] parameters = {
                new SqlParameter("@word", currentWord),
                // Sent as text, matching the quoted literals of the original statement.
                new SqlParameter("@cateID", "" + doc.CateID),
                new SqlParameter("@docID", "" + doc.DocID),
                new SqlParameter("@termFrequence", termFrequencies[i])
            };
            using (SqlDatabase sqlDB = new SqlDatabase(insertSql, parameters))
            {
                sqlDB.Run();
            }
        }
    }
    catch (Exception e)
    {
        // Same message as before; the cause is now preserved as InnerException.
        throw new Exception(" 单词 ["+ currentWord +"] 有误,请检查!" + e.Message, e);
    }
}
#endregion
#region 计算tfidt权重
/// <summary>
/// Computes a TF-IDF weight for every row of table WordLib and inserts the result
/// into table TfidfLib (columns word, cateID, weight, termFrequence, docFrequence).
/// </summary>
/// <remarks>
/// weight = termFrequence * ln(N / df + 0.01), where N is DocLib.GetDocCount() and
/// df is the number of WordLib rows that carry the same word.
/// </remarks>
/// <exception cref="Exception">
/// Computing or inserting a row failed. The message names the word and the original
/// exception is available through InnerException.
/// </exception>
public void CreateTfidfLib()
{
    DataTable dt_word = GetWordLib("WordLib");
    if (dt_word.Rows.Count == 0)
    {
        return; // nothing to weigh, so the document count is not needed either
    }

    // Document frequency of every word, counted in one pass instead of rescanning
    // the whole table for each row.
    System.Collections.Hashtable dfByWord = new System.Collections.Hashtable();
    foreach (DataRow counted in dt_word.Rows)
    {
        string key = counted["word"].ToString();
        object seen = dfByWord[key];
        dfByWord[key] = (seen == null) ? 1 : (int)seen + 1;
    }

    // Fetched once (it does not depend on the row) and held as double so N / df is
    // a real division. NOTE(review): assumes GetDocCount() returns an integer count,
    // in which case the original N / df silently truncated - confirm.
    double docCount = new Njnu.DAL.DocLib().GetDocCount();

    string insertSql = "insert TfidfLib(word, cateID, weight,termFrequence, docFrequence) values(@word, @cateID, @weight, @termFrequence, @docFrequence)";
    foreach (DataRow row in dt_word.Rows)
    {
        string word = row["word"].ToString();
        int df = (int)dfByWord[word]; // always >= 1: the row itself was counted
        try
        {
            int termFrequence = Convert.ToInt32( row["termFrequence"] );
            double weightValue = termFrequence * Math.Log( docCount / df + 0.01 );

            // Bound parameters: no quoting problems for words with apostrophes and
            // no culture-dependent formatting of the double.
            SqlParameter[] parameters = {
                new SqlParameter("@word", word),
                new SqlParameter("@cateID", row["CateID"].ToString()),
                new SqlParameter("@weight", weightValue),
                new SqlParameter("@termFrequence", termFrequence),
                new SqlParameter("@docFrequence", df)
            };
            using (SqlDatabase sqlDB = new SqlDatabase(insertSql, parameters))
            {
                sqlDB.Run();
            }
        }
        catch (Exception e)
        {
            // Same message as before; the cause is now preserved as InnerException.
            throw new Exception("单词("+ word +")发生错误!" + e.Message, e);
        }
    }
}
/// <summary>
/// Computes a TF-IDF weight for every row of table WordLib_test, using document
/// frequencies taken from table WordLib, and inserts the result into table
/// TfidfLib_test (columns word, cateID, docID, weight, termFrequence, docFrequence).
/// </summary>
/// <remarks>
/// weight = termFrequence * ln(N / (df + 1) + 0.01), where N is DocLib.GetDocCount()
/// and df is the number of WordLib rows that carry the same word. df is 0 for a word
/// that does not occur in WordLib; the "+ 1" keeps the division defined in that case.
/// The stored docFrequence is df itself, not df + 1.
/// </remarks>
/// <exception cref="Exception">
/// Computing or inserting a row failed. The message names the word and the original
/// exception is available through InnerException.
/// </exception>
public void CreateTfidfLib_test()
{
    DataTable dt_testWord = GetWordLib("WordLib_test");
    DataTable dt_word = GetWordLib("WordLib");
    if (dt_testWord.Rows.Count == 0)
    {
        return; // nothing to weigh, so the document count is not needed either
    }

    // Document frequency of every WordLib word, counted in one pass instead of
    // rescanning the whole table for each test row.
    System.Collections.Hashtable dfByWord = new System.Collections.Hashtable();
    foreach (DataRow counted in dt_word.Rows)
    {
        string key = counted["word"].ToString();
        object seen = dfByWord[key];
        dfByWord[key] = (seen == null) ? 1 : (int)seen + 1;
    }

    // Fetched once (it does not depend on the row) and held as double so the
    // division is a real division. NOTE(review): assumes GetDocCount() returns an
    // integer count, in which case the original N / (df+1) silently truncated - confirm.
    double docCount = new Njnu.DAL.DocLib().GetDocCount();

    string insertSql = "insert TfidfLib_test(word, cateID, docID, weight,termFrequence, docFrequence) values(@word, @cateID, @docID, @weight, @termFrequence, @docFrequence)";
    foreach (DataRow row in dt_testWord.Rows)
    {
        string word = row["word"].ToString();
        object known = dfByWord[word];
        int df = (known == null) ? 0 : (int)known;
        try
        {
            int termFrequence = Convert.ToInt32( row["termFrequence"] );
            double weightValue = termFrequence * Math.Log( docCount / (df + 1) + 0.01 );

            // Bound parameters: no quoting problems for words with apostrophes and
            // no culture-dependent formatting of the double.
            SqlParameter[] parameters = {
                new SqlParameter("@word", word),
                new SqlParameter("@cateID", row["cateID"].ToString()),
                new SqlParameter("@docID", row["docID"].ToString()),
                new SqlParameter("@weight", weightValue),
                new SqlParameter("@termFrequence", termFrequence),
                new SqlParameter("@docFrequence", df)
            };
            using (SqlDatabase sqlDB = new SqlDatabase(insertSql, parameters))
            {
                sqlDB.Run();
            }
        }
        catch (Exception e)
        {
            // Same message as before; the cause is now preserved as InnerException.
            throw new Exception("单词("+ word +")发生错误!" + e.Message, e);
        }
    }
}
/// <summary>
/// Runs "select * from <paramref name="tableName"/>" and returns the rows in a
/// newly created DataTable.
/// </summary>
/// <param name="tableName">Table to read; concatenated into the SQL text as-is.</param>
/// <returns>The DataTable that SqlDatabase.Run was asked to fill.</returns>
private DataTable GetWordLib(string tableName)
{
    string selectAll = "select * from " + tableName;
    SqlParameter[] noParameters = {};
    DataTable result = new DataTable();

    using (SqlDatabase database = new SqlDatabase(selectAll, noParameters))
    {
        database.Run(result);
    }

    return result;
}
#endregion
#region 计算每类的特征项
/// <summary>
/// Walks every category returned by Category.GetAllCate() and merges that
/// category's terms via CreateTempTermLibByCateID, passing along the category's
/// document count from DocLib.GetDocCountByCateID.
/// </summary>
public void CreateTempTermLib()
{
    DataRowCollection categories = new Njnu.DAL.Category().GetAllCate().Rows;

    for (int index = 0; index < categories.Count; index++)
    {
        // Merge the feature terms of one category.
        string categoryId = categories[index]["ID"].ToString();
        int docsInCategory = new Njnu.DAL.DocLib().GetDocCountByCateID(categoryId);
        CreateTempTermLibByCateID(categoryId, docsInCategory);
    }
}
//表TermLib_temp中各类别里的word不重复
/// <summary>
/// Merges the TfidfLib rows of one category into table TermLib_temp, one row per
/// distinct word (columns word, cateID, weight, docFrequence).
/// </summary>
/// <remarks>
/// For each distinct word, docFrequence is the number of the category's TfidfLib
/// rows carrying that word, and weight is the sum of those rows' weights divided by
/// <paramref name="cateCount"/>.
/// </remarks>
/// <param name="cateID">Category whose rows are read from TfidfLib.</param>
/// <param name="cateCount">
/// Divisor applied to each summed weight; the caller in this file passes the
/// category's document count. NOTE(review): a value of 0 yields a non-finite
/// weight - confirm that callers never pass 0 for a category that has rows.
/// </param>
private void CreateTempTermLibByCateID(string cateID, int cateCount)
{
    // One row per distinct word of the category.
    DataTable distinctTerms = new TfidfLib().GetDistinctTfidfLib(cateID, "TfidfLib");
    // Every TfidfLib row of the category (repeated words included).
    DataTable allTerms = new TfidfLib().GetTfidfLib(cateID, "TfidfLib");
    if (distinctTerms.Rows.Count == 0)
    {
        return; // nothing to insert
    }

    // Aggregate once, in row order, instead of rescanning allTerms for every
    // distinct word. Weights are added in the same order as before.
    System.Collections.Hashtable occurrencesByWord = new System.Collections.Hashtable();
    System.Collections.Hashtable weightSumByWord = new System.Collections.Hashtable();
    foreach (DataRow termRow in allTerms.Rows)
    {
        string key = termRow["word"].ToString();
        double rowWeight = Convert.ToDouble( termRow["weight"] );
        object seen = occurrencesByWord[key];
        if (seen == null)
        {
            occurrencesByWord[key] = 1;
            weightSumByWord[key] = 0 + rowWeight;
        }
        else
        {
            occurrencesByWord[key] = (int)seen + 1;
            weightSumByWord[key] = (double)weightSumByWord[key] + rowWeight;
        }
    }

    string insertSql = "insert TermLib_temp(word, cateID, weight, docFrequence) values(@word, @cateID, @weight, @docFrequence)";
    foreach (DataRow row in distinctTerms.Rows)
    {
        string word1 = row["word"].ToString();
        object seen = occurrencesByWord[word1];
        int df = (seen == null) ? 0 : (int)seen;
        double weight = (seen == null) ? 0 : (double)weightSumByWord[word1];

        // Bound parameters: no quoting problems for words with apostrophes and no
        // culture-dependent formatting of the double.
        SqlParameter[] parameters = {
            new SqlParameter("@word", word1),
            new SqlParameter("@cateID", row["cateID"].ToString()),
            new SqlParameter("@weight", weight / cateCount),
            new SqlParameter("@docFrequence", df)
        };
        using (SqlDatabase sqlDB = new SqlDatabase(insertSql, parameters))
        {
            sqlDB.Run();
        }
    }
}
//下面进行归一化处理
public void CreateTermLib()
{
DataTable dt_cate = new DataTable();
dt_cate = new Njnu.DAL.Category().GetAllCate();
foreach(DataRow row in dt_cate.Rows) //对每一类进行特征项归并
{
DataTable dt = new DataTable();
dt = GetTempTermLib( row["ID"].ToString() ); //取得每一类特征库
double downValue = 0; //代表分母值
double upValue = 0; //代表分子值
foreach(DataRow row1 in dt.Rows)
{
double weightValue = Convert.ToDouble( row1["Weight"] );
downValue = downValue + weightValue * weightValue; //计算分母值
}
foreach(DataRow row2 in dt.Rows)
{
upValue = Convert.ToDouble( row2["Weight"] );
double weightValue = upValue / ( Math.Sqrt(downValue) );
//插入表TermLib
string str = "insert TermLib(word, cateID, weight, docFrequence) values('" + row2["word"].ToString() + "','" + row2["cateID"].ToString() + "'," + weightValue + "," + Convert.ToInt32( row2["docFrequence"] ) + ")";
SqlParameter[] parameters = {};
using(SqlDatabase sqlDB = new SqlDatabase(str, parameters))
{
sqlDB.Run();
}
}//foreach
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -