⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 wordlib.cs

📁 实现文本的特征提取
💻 CS
📖 第 1 页 / 共 2 页
字号:
			}//foreach dt_cate循环结束	
		
			#region 统一文档频数df

			DataTable dt_termLib = new DataTable();
			dt_termLib = new Njnu.DAL.TermLib().GetTermLib(); //获取TermLib表

			foreach(DataRow row3 in dt_termLib.Rows)
			{							
				int df = 0;
				foreach(DataRow row4 in dt_termLib.Rows)
				{
					if( row3["word"].ToString().Equals( row4["word"].ToString() ))
					{														
						df += Convert.ToInt32( row4["docFrequence"] );	
					}			
				}

				//更新表TermLib
				string str = "update TermLib set docFrequence = " + df + " where word = '" + row3["word"].ToString() +"'";							
				SqlParameter[] parameters = {};

				using(SqlDatabase sqlDB = new SqlDatabase(str, parameters))
				{
					sqlDB.Run();
				}	
			}//foreach

			#endregion
		}

		/// <summary>
		/// Queries TermLib_Temp for the rows whose cateID equals the given value.
		/// </summary>
		/// <param name="cateID">Category id; it is spliced into the query text as a quoted literal.</param>
		/// <returns>The DataTable handed to SqlDatabase.Run (presumably filled with the query result).</returns>
		private DataTable GetTempTermLib(string cateID)
		{
			string query = "select * from TermLib_Temp where cateID ='" + cateID + "'";
			SqlParameter[] noParameters = {};
			DataTable result = new DataTable();

			using(SqlDatabase db = new SqlDatabase(query, noParameters))
			{
				db.Run(result);
			}

			return result;
		}		

		#endregion

		#region 计算测试文档的相似度值
		/// <summary>
		/// Scores one test document against every category, stores the best-scoring
		/// category in DocInforLib_test and returns a textual report of all scores.
		/// </summary>
		/// <remarks>
		/// For each category the score is
		/// sum(w_doc * w_cat) / sqrt(sum(w_doc^2) * sum(w_cat^2)),
		/// where the numerator and the document-side sum run over terms present in both
		/// the document and the category, and the category-side sum runs over all of the
		/// category's terms.
		/// NOTE(review): a textbook cosine similarity would sum w_doc^2 over ALL document
		/// terms, not only the matched ones; the existing behaviour is kept here - confirm
		/// whether it is intentional.
		/// </remarks>
		/// <param name="dt">
		/// Term rows of a single document. The code reads the columns docID, cateID, word
		/// and weight, and takes docID/cateID from the first row, so the table must contain
		/// at least one row.
		/// </param>
		/// <returns>The report text (document id, name, per-category scores, chosen category).</returns>
		public string CalculateSimValue(DataTable dt)
		{
			// Identity of the document under test, taken from the first row.
			string docID = dt.Rows[0]["docID"].ToString();
			string originalCateID = dt.Rows[0]["cateID"].ToString();

			double simValue = 0;       // best similarity seen so far
			string testCateID = null;  // category that produced the best similarity

			string testInfor = null;
			testInfor += "文档号:" + docID + "   原来类别:" + new Category().GetCateNameByCateID( originalCateID ) + " " + originalCateID;
			testInfor += "\r\n文档名:" + new DocLib().GetDocName( docID );
			testInfor += "\r\n测试结果:";

			DataTable dt_cateLib = new TermLib().GetCateID();

			foreach(DataRow row_cate in dt_cateLib.Rows)	// one pass per category
			{
				string cateID = row_cate["cateID"].ToString();
				DataTable dt_termLib = new TermLib().GetTermLibByCateID(cateID);

				// Per-category accumulators; declared here so they start at zero each pass.
				double upValue = 0;     // numerator: dot product over shared terms
				double downValue1 = 0;  // document-side squared weights (shared terms only)
				double downValue2 = 0;  // category-side squared weights (all terms)

				foreach(DataRow row in dt.Rows)
				{
					string word1 = row["word"].ToString();
					double weight1 = Convert.ToDouble( row["weight"] );

					foreach(DataRow row2 in dt_termLib.Rows)
					{
						string word2 = row2["word"].ToString();
						double weight2 = Convert.ToDouble( row2["weight"] );
						if(word1.Equals(word2))
						{
							upValue += weight1 * weight2;
							downValue1 += weight1 * weight1;
						}
					}
				}

				foreach(DataRow row3 in dt_termLib.Rows)
				{
					double weight = Convert.ToDouble( row3["weight"] );
					downValue2 += weight * weight;
				}

				// When the document shares no term with the category (or all weights are
				// zero) the denominator is 0 and the division would yield NaN; report a
				// similarity of 0 instead. The chosen category is unaffected because a NaN
				// score never won the comparison below either.
				double denominator = Math.Sqrt(downValue1 * downValue2);
				double simValue_temp = denominator > 0 ? upValue / denominator : 0;

				testInfor += "\r\n\t" + new Category().GetCateNameByCateID( cateID ) + "\t SimValue:" + simValue_temp;
				if(simValue < simValue_temp)
				{
					simValue = simValue_temp;
					testCateID = cateID;
				}
			}

			// testCateID is still null here when no category scored above 0.
			testInfor += "\r\n----------\r\n===>>> 判定类别:" + new Category().GetCateNameByCateID( testCateID ) + "" + testCateID +  "\r\n\r\n-----------------------------------------------------------------------------------\r\n\r\n";

			// Persist the verdict. The number is formatted with the invariant culture so
			// the SQL literal always uses '.' as decimal separator, whatever the thread
			// culture is (a comma would make the statement invalid).
			string simValueLiteral = simValue.ToString(System.Globalization.CultureInfo.InvariantCulture);
			string str = "update DocInforLib_test set testCateID = '" + testCateID + "', simValue = " + simValueLiteral + " where docID = '" + docID + "'";
			SqlParameter[] parameters = {};

			using(SqlDatabase sqlDB = new SqlDatabase(str, parameters))
			{
				sqlDB.Run();
			}

			return testInfor;
		}
		#endregion

		#region 去词降维

		/// <summary>
		/// Deletes every row of the given table whose word satisfies len(word) = 1.
		/// </summary>
		/// <param name="tableName">Table name; it is concatenated into the statement as-is.</param>
		/// <returns>The value returned by SqlDatabase.Run() (presumably the affected row count).</returns>
		public int DeleteSigleWord(string tableName)
		{			
			SqlParameter[] noParameters = {};
			string sql = "delete from " + tableName + " where len(word) = 1";

			using(SqlDatabase db = new SqlDatabase(sql, noParameters))
			{
				return db.Run();
			}
		}
			
	
		/// <summary>
		/// Deletes the rows of the given table whose termFrequence column is below the threshold.
		/// </summary>
		/// <param name="termFrequence">Exclusive lower bound; rows with a smaller termFrequence are deleted.</param>
		/// <param name="tableName">Table name; it is concatenated into the statement as-is.</param>
		/// <returns>The value returned by SqlDatabase.Run() (presumably the affected row count).</returns>
		public int DeleteWordByTF(int termFrequence, string tableName)
		{
			SqlParameter[] noParameters = {};
			string sql = "delete from " + tableName + " where termFrequence < " + termFrequence;

			using(SqlDatabase db = new SqlDatabase(sql, noParameters))
			{
				return db.Run();
			}
		}

		/// <summary>
		/// Deletes the rows of TermLib_temp whose docFrequence column is below the threshold.
		/// </summary>
		/// <param name="docFrequence">Exclusive lower bound; rows with a smaller docFrequence are deleted.</param>
		/// <returns>The value returned by SqlDatabase.Run() (presumably the affected row count).</returns>
		public int DeleteWordByDF(int docFrequence)
		{
			SqlParameter[] noParameters = {};
			string sql = "delete from TermLib_temp where docFrequence < " + docFrequence;

			using(SqlDatabase db = new SqlDatabase(sql, noParameters))
			{
				return db.Run();
			}
		}

		#region 去除噪音词条
		/// <summary>
		/// Removes noise terms from the given table by running a fixed series of
		/// pattern-based DELETE statements, one per token, in the order listed below.
		/// </summary>
		/// <param name="tableName">Table name; it is concatenated into each statement as-is.</param>
		/// <returns>The sum of the values returned by SqlDatabase.Run() for all statements.</returns>
		public int DeleteRareWord(string tableName)
		{
			// Each rule is { tokens separated by '/', text before the token, text after the token }.
			// The statement issued for a token is:
			//   delete from <tableName> where word like <before><token><after>
			string[][] rules =
			{
				// words ending in one of these characters (any length)
				new string[] { "省/市/县/们", "'%", "'" },
				// words containing a middle dot
				new string[] { "·", "'%", "%'" },
				// words ending in "中学" that are longer than 2 characters
				new string[] { "中学", "'%", "'  and len(word)>2" },
				// words ending in "一" whose length is not 2
				new string[] { "一", "'%", "'  and len(word)!=2" },
				// 2-character words starting with "第"
				new string[] { "第", "'", "%' and len(word)=2" },
				// function-word list 1: contained anywhere, word shorter than 4
				new string[] { "多/有/以/此/为/是/也/么", "'%", "%' and len(word)<4" },
				// function-word list 2: contained anywhere, word shorter than 3
				new string[] { "然/而/于/乎", "'%", "%' and len(word)<3" },
				// function-word list 3: at the end of the word, word shorter than 4
				new string[] { "的/之", "'%", "' and len(word)<4" }
			};

			SqlParameter[] noParameters = {};
			string statementHead = "delete from " + tableName + " where word like ";
			int total = 0;

			foreach(string[] rule in rules)
			{
				string before = rule[1];
				string after = rule[2];

				foreach(string token in rule[0].Split('/'))
				{
					string sql = statementHead + before + token + after;
					using(SqlDatabase db = new SqlDatabase(sql, noParameters))
					{
						total += db.Run();
					}
				}
			}

			return total;
		}
		#endregion

		#endregion
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -