📄 wordlib.cs
字号:
}//foreach dt_cate循环结束
#region 统一文档频数df
DataTable dt_termLib = new DataTable();
dt_termLib = new Njnu.DAL.TermLib().GetTermLib(); //获取TermLib表
foreach(DataRow row3 in dt_termLib.Rows)
{
int df = 0;
foreach(DataRow row4 in dt_termLib.Rows)
{
if( row3["word"].ToString().Equals( row4["word"].ToString() ))
{
df += Convert.ToInt32( row4["docFrequence"] );
}
}
//更新表TermLib
string str = "update TermLib set docFrequence = " + df + " where word = '" + row3["word"].ToString() +"'";
SqlParameter[] parameters = {};
using(SqlDatabase sqlDB = new SqlDatabase(str, parameters))
{
sqlDB.Run();
}
}//foreach
#endregion
}
/// <summary>
/// Loads every row of <c>TermLib_Temp</c> whose <c>cateID</c> equals the given category id.
/// </summary>
/// <param name="cateID">Category id to filter on. A null value is treated as an empty string,
/// which is what the previous string concatenation produced.</param>
/// <returns>A new <see cref="DataTable"/> filled by <c>SqlDatabase.Run(DataTable)</c>.</returns>
private DataTable GetTempTermLib(string cateID)
{
    // The id is bound as a parameter instead of being spliced into the SQL text, so a value
    // containing an apostrophe can neither break the statement nor change its meaning.
    // NOTE(review): assumes SqlDatabase attaches the supplied SqlParameter array to the
    // command it executes — confirm against the SqlDatabase implementation.
    string sql = "select * from TermLib_Temp where cateID = @cateID";
    SqlParameter[] parameters =
    {
        new SqlParameter("@cateID", cateID == null ? string.Empty : cateID)
    };

    DataTable result = new DataTable();
    using (SqlDatabase sqlDB = new SqlDatabase(sql, parameters))
    {
        sqlDB.Run(result);
    }
    return result;
}
#endregion
#region 计算测试文档的相似度值
/// <summary>
/// Scores one test document against every category, stores the best match in
/// <c>DocInforLib_test</c>, and returns a textual report of all scores.
/// </summary>
/// <param name="dt">Term rows of the document under test. The columns "docID" and "cateID"
/// are read from the first row only; "word" and "weight" are read from every row.
/// The table must contain at least one row because <c>dt.Rows[0]</c> is accessed.</param>
/// <returns>A multi-line report: document id, original category, document name, one
/// "SimValue" line per category, and the category finally chosen.</returns>
/// <remarks>
/// Per category the score is
/// <c>sum(docWeight * termWeight) / sqrt(sum(docWeight^2) * sum(termWeight^2))</c>,
/// where the first two sums run over matching (document word, category word) pairs and the
/// last sum runs over all terms of the category. The category with the highest score
/// strictly greater than 0 wins; if none qualifies the chosen category id stays null.
/// </remarks>
public string CalculateSimValue(DataTable dt)
{
    string docID = dt.Rows[0]["docID"].ToString();
    string originalCateID = dt.Rows[0]["cateID"].ToString();

    System.Text.StringBuilder report = new System.Text.StringBuilder();
    report.Append("文档号:").Append(docID);
    report.Append(" 原来类别:").Append(new Category().GetCateNameByCateID(originalCateID));
    report.Append(" ").Append(originalCateID);
    report.Append("\r\n文档名:").Append(new DocLib().GetDocName(docID));
    report.Append("\r\n测试结果:");

    double simValue = 0;        // best similarity seen so far
    string testCateID = null;   // category id that produced the best similarity

    DataTable dt_cateLib = new TermLib().GetCateID();
    foreach (DataRow row_cate in dt_cateLib.Rows)
    {
        string cateID = row_cate["cateID"].ToString();
        DataTable dt_termLib = new TermLib().GetTermLibByCateID(cateID);

        // Convert the category's term rows once instead of once per document row, and
        // accumulate the second denominator part (sum of squared term weights) on the way.
        int termCount = dt_termLib.Rows.Count;
        string[] termWords = new string[termCount];
        double[] termWeights = new double[termCount];
        double downValue2 = 0;
        for (int i = 0; i < termCount; i++)
        {
            DataRow termRow = dt_termLib.Rows[i];
            termWords[i] = termRow["word"].ToString();
            termWeights[i] = Convert.ToDouble(termRow["weight"]);
            downValue2 += termWeights[i] * termWeights[i];
        }

        double upValue = 0;     // numerator: sum of weight products over matching words
        double downValue1 = 0;  // first denominator part: squared document weights of matches
        foreach (DataRow row in dt.Rows)
        {
            string word1 = row["word"].ToString();
            double weight1 = Convert.ToDouble(row["weight"]);
            for (int i = 0; i < termCount; i++)
            {
                if (word1.Equals(termWords[i]))
                {
                    upValue += weight1 * termWeights[i];
                    // NOTE(review): this only accumulates for matching pairs (once per pair),
                    // which differs from a textbook cosine norm over the whole document
                    // vector. Kept exactly as before — confirm whether it is intended.
                    downValue1 += weight1 * weight1;
                }
            }
        }

        // When nothing matches (or the category has no terms) the denominator is 0 and the
        // plain division would yield NaN, which then shows up in the report. Score it as 0.
        double denominator = Math.Sqrt(downValue1 * downValue2);
        double simValue_temp = denominator > 0 ? upValue / denominator : 0;

        report.Append("\r\n\t").Append(new Category().GetCateNameByCateID(cateID));
        report.Append("\t SimValue:").Append(simValue_temp);

        if (simValue < simValue_temp)
        {
            simValue = simValue_temp;
            testCateID = cateID;
        }
    }

    report.Append("\r\n----------\r\n===>>> 判定类别:");
    report.Append(new Category().GetCateNameByCateID(testCateID)).Append(testCateID);
    report.Append("\r\n\r\n-----------------------------------------------------------------------------------\r\n\r\n");

    // Persist the verdict. Values are bound as parameters so that the similarity is not
    // formatted with the current culture (a decimal comma would corrupt the statement) and
    // ids containing an apostrophe cannot break or alter the SQL.
    // NOTE(review): assumes SqlDatabase attaches the supplied SqlParameter array to the
    // command it executes — confirm against the SqlDatabase implementation.
    string sql = "update DocInforLib_test set testCateID = @testCateID, simValue = @simValue where docID = @docID";
    SqlParameter[] parameters =
    {
        // A null verdict is written as an empty string, as the concatenated SQL did before.
        new SqlParameter("@testCateID", testCateID == null ? string.Empty : testCateID),
        new SqlParameter("@simValue", simValue),
        new SqlParameter("@docID", docID)
    };
    using (SqlDatabase sqlDB = new SqlDatabase(sql, parameters))
    {
        sqlDB.Run();
    }

    return report.ToString();
}
#endregion
#region 去词降维
/// <summary>
/// Deletes every row of the given table whose <c>word</c> is exactly one character long.
/// </summary>
/// <param name="tableName">Table to delete from. It is concatenated into the SQL text
/// unchanged, so callers must pass a trusted table name.</param>
/// <returns>The value returned by <c>SqlDatabase.Run()</c> (presumably the number of rows
/// deleted — confirm against SqlDatabase).</returns>
public int DeleteSigleWord(string tableName)
{
    SqlParameter[] noParameters = new SqlParameter[0];
    string deleteSql = string.Concat("delete from ", tableName, " where len(word) = 1");

    using (SqlDatabase db = new SqlDatabase(deleteSql, noParameters))
    {
        return db.Run();
    }
}
/// <summary>
/// Deletes every row of the given table whose <c>termFrequence</c> is below the threshold.
/// </summary>
/// <param name="termFrequence">Exclusive lower bound; rows with a smaller value are deleted.</param>
/// <param name="tableName">Table to delete from. It is concatenated into the SQL text
/// unchanged, so callers must pass a trusted table name.</param>
/// <returns>The value returned by <c>SqlDatabase.Run()</c> (presumably the number of rows
/// deleted — confirm against SqlDatabase).</returns>
public int DeleteWordByTF(int termFrequence, string tableName)
{
    SqlParameter[] noParameters = new SqlParameter[0];
    string threshold = termFrequence.ToString();
    string deleteSql = string.Concat("delete from ", tableName, " where termFrequence < ", threshold);

    using (SqlDatabase db = new SqlDatabase(deleteSql, noParameters))
    {
        return db.Run();
    }
}
/// <summary>
/// Deletes every row of <c>TermLib_temp</c> whose <c>docFrequence</c> is below the threshold.
/// </summary>
/// <param name="docFrequence">Exclusive lower bound; rows with a smaller value are deleted.</param>
/// <returns>The value returned by <c>SqlDatabase.Run()</c> (presumably the number of rows
/// deleted — confirm against SqlDatabase).</returns>
public int DeleteWordByDF(int docFrequence)
{
    SqlParameter[] noParameters = new SqlParameter[0];
    string threshold = docFrequence.ToString();
    string deleteSql = string.Concat("delete from TermLib_temp where docFrequence < ", threshold);

    using (SqlDatabase db = new SqlDatabase(deleteSql, noParameters))
    {
        return db.Run();
    }
}
#region 去除噪音词条
/// <summary>
/// Removes "noise" entries from the given term table by running a fixed series of
/// <c>delete ... where word like ...</c> statements, in a fixed order.
/// </summary>
/// <param name="tableName">Table to clean. It is concatenated into the SQL text unchanged,
/// so callers must pass a trusted table name.</param>
/// <returns>The sum of the values returned by <c>SqlDatabase.Run()</c> for every statement
/// (presumably the total number of rows deleted — confirm against SqlDatabase).</returns>
public int DeleteRareWord(string tableName)
{
    int total = 0;

    // Words ending in a province / city / county marker, or in the plural marker.
    foreach (string suffix in "省/市/县/们".Split('/'))
    {
        total += RunRareWordDelete(tableName, "word like '%" + suffix + "'");
    }

    // Words containing the middle dot.
    total += RunRareWordDelete(tableName, "word like '%·%'");

    // Middle-school names: words ending in 中学 that are longer than two characters.
    total += RunRareWordDelete(tableName, "word like '%中学' and len(word)>2");

    // Words ending in 一. The original comment speaks of three-character words, but the
    // condition removes every length except two.
    total += RunRareWordDelete(tableName, "word like '%一' and len(word)!=2");

    // Two-character words starting with 第.
    total += RunRareWordDelete(tableName, "word like '第%' and len(word)=2");

    // Function words, part 1: contained anywhere in words shorter than four characters.
    foreach (string particle in "多/有/以/此/为/是/也/么".Split('/'))
    {
        total += RunRareWordDelete(tableName, "word like '%" + particle + "%' and len(word)<4");
    }

    // Function words, part 2: contained anywhere in words shorter than three characters.
    foreach (string particle in "然/而/于/乎".Split('/'))
    {
        total += RunRareWordDelete(tableName, "word like '%" + particle + "%' and len(word)<3");
    }

    // Function words, part 3: at the end of words shorter than four characters.
    foreach (string particle in "的/之".Split('/'))
    {
        total += RunRareWordDelete(tableName, "word like '%" + particle + "' and len(word)<4");
    }

    return total;
}

/// <summary>
/// Executes <c>delete from {tableName} where {condition}</c> with no SQL parameters and
/// returns whatever <c>SqlDatabase.Run()</c> returns.
/// </summary>
private int RunRareWordDelete(string tableName, string condition)
{
    string deleteSql = "delete from " + tableName + " where " + condition;
    using (SqlDatabase db = new SqlDatabase(deleteSql, new SqlParameter[0]))
    {
        return db.Run();
    }
}
#endregion
#endregion
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -