⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 wordlib.cs

📁 实现文本的特征提取
💻 CS
📖 第 1 页 / 共 2 页
字号:
using System;
using System.Data;
using System.Data.SqlClient;
using System.IO;

namespace Njnu.DAL
{
	/// <summary>
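	/// Data-access helper that fills the word-frequency table (WordLib) for documents and derives the TF-IDF and per-category term tables (TfidfLib, TermLib_temp, TermLib) from it.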
	/// </summary>
	public class WordLib
	{			
		#region 将某个文档信息添加至表WordLib
		
		/// <summary>
		/// Segments the body of <paramref name="doc"/> into words, counts how often each
		/// distinct word occurs, and inserts one row per distinct word into
		/// <paramref name="tableName"/> (columns word, cateID, docID, termFrequence).
		/// </summary>
		/// <param name="doc">Document whose Path, CateID and DocID are read.</param>
		/// <param name="tableName">
		/// Target table name. It is concatenated into the SQL text unchanged, so it must
		/// come from trusted code, never from user input.
		/// </param>
		/// <exception cref="Exception">
		/// Thrown when an insert fails; the message names the offending word and the
		/// original exception is preserved as InnerException.
		/// </exception>
		public void AddWord(Common.Doc doc, string tableName)
		{
			WordSegmentor.WordSegmentor segmentor = new WordSegmentor.WordSegmentor(false);
			string segmented = segmentor.GetString(new DAL.Doc().GetBody(doc.Path));

			// A missing or empty body has no words to record.
			if (segmented == null || segmented.Length == 0)
			{
				return;
			}

			// The segmented text is split on '/' into individual tokens.
			string[] words = segmented.Split('/');

			// Per-document term-frequency count. Distinct words are kept in
			// first-occurrence order in slots 0..distinctCount-1; the hash table maps
			// a word to its slot so each token is handled in constant time.
			string[] distinctWords = new string[words.Length];
			int[] wordCounts = new int[words.Length];
			int distinctCount = 0;
			System.Collections.Hashtable slotByWord = new System.Collections.Hashtable();

			foreach (string token in words)
			{
				// Empty tokens (e.g. from a leading/trailing or doubled '/') are not words.
				if (token.Length == 0)
				{
					continue;
				}

				object slot = slotByWord[token];
				if (slot == null)
				{
					slotByWord[token] = distinctCount;
					distinctWords[distinctCount] = token;
					wordCounts[distinctCount] = 1;
					distinctCount++;
				}
				else
				{
					wordCounts[(int)slot]++;
				}
			}

			int i = 0;
			try
			{
				// Insert every distinct word, including the last one.
				for (i = 0; i < distinctCount; i++)
				{
					// Words come from document text, so single quotes are doubled to keep
					// the statement well-formed.
					// NOTE(review): passing the values through the SqlParameter[] argument
					// would be preferable once SqlDatabase's parameter handling is confirmed.
					string str = "insert " + tableName + "(word, cateID, docID, termFrequence) values('" + distinctWords[i].Replace("'", "''") + "','" + doc.CateID + "','" + doc.DocID + "'," + wordCounts[i] + ")";
					SqlParameter[] parameters = {};

					using (SqlDatabase sqlDB = new SqlDatabase(str, parameters))
					{
						sqlDB.Run();
					}
				}
			}
			catch (Exception e)
			{
				// Keep the original exception so the stack trace and type are not lost.
				throw new Exception(" 单词 ["+ distinctWords[i] +"] 有误,请检查!" + e.Message, e);
			}
		}
	
		#endregion
		

		#region 计算tfidt权重

		/// <summary>
		/// Reads every row of table WordLib and inserts a matching row into TfidfLib with
		/// weight = termFrequence * ln(docCount / df + 0.01), where df is the number of
		/// WordLib rows that carry the same word and docCount comes from DocLib.GetDocCount().
		/// </summary>
		/// <exception cref="Exception">
		/// Thrown when computing or inserting a row fails; the message names the word and
		/// the original exception is preserved as InnerException.
		/// </exception>
		public void CreateTfidfLib()
		{
			DataTable dt_word = GetWordLib("WordLib");

			// The document count does not change per row, so it is queried once.
			// Converting to double makes docCount / df a floating-point division.
			// NOTE(review): GetDocCount()'s return type is not visible here; if it is an
			// integer type, the original int / int division truncated the ratio.
			double docCount = Convert.ToDouble(new Njnu.DAL.DocLib().GetDocCount());

			// Count the rows per word in a single pass instead of rescanning the whole
			// table for every row.
			System.Collections.Hashtable rowsPerWord = new System.Collections.Hashtable();
			foreach (DataRow countedRow in dt_word.Rows)
			{
				string countedWord = countedRow["word"].ToString();
				object seen = rowsPerWord[countedWord];
				rowsPerWord[countedWord] = (seen == null) ? 1 : (int)seen + 1;
			}

			foreach (DataRow row in dt_word.Rows)
			{
				string word = row["word"].ToString();

				try
				{
					// df is at least 1 here because the row itself was counted above.
					int df = (int)rowsPerWord[word];
					int termFrequence = Convert.ToInt32(row["termFrequence"]);
					double weightValue = termFrequence * Math.Log(docCount / df + 0.01);

					// Insert into table TfidfLib. The weight is formatted with the invariant
					// culture so a decimal-comma locale cannot corrupt the SQL text, and
					// single quotes in text values are doubled.
					string str = "insert TfidfLib(word, cateID, weight,termFrequence, docFrequence) values('" + word.Replace("'", "''") + "','" + row["CateID"].ToString().Replace("'", "''") + "'," + weightValue.ToString(System.Globalization.CultureInfo.InvariantCulture) + "," + termFrequence + "," + df + ")";
					SqlParameter[] parameters = {};

					using (SqlDatabase sqlDB = new SqlDatabase(str, parameters))
					{
						sqlDB.Run();
					}
				}
				catch (Exception e)
				{
					// Keep the original exception so the stack trace and type are not lost.
					throw new Exception("单词("+ word +")发生错误!" + e.Message, e);
				}
			}
		}


		/// <summary>
		/// Reads every row of table WordLib_test and inserts a matching row into
		/// TfidfLib_test with weight = termFrequence * ln(docCount / (df + 1) + 0.01),
		/// where df is the number of rows in table WordLib that carry the same word
		/// (0 when the word does not occur there) and docCount comes from
		/// DocLib.GetDocCount().
		/// </summary>
		/// <exception cref="Exception">
		/// Thrown when computing or inserting a row fails; the message names the word and
		/// the original exception is preserved as InnerException.
		/// </exception>
		public void CreateTfidfLib_test()
		{
			DataTable dt_testWord = GetWordLib("WordLib_test");
			DataTable dt_word = GetWordLib("WordLib");

			// The document count does not change per row, so it is queried once.
			// Converting to double makes the division below a floating-point division.
			// NOTE(review): GetDocCount()'s return type is not visible here; if it is an
			// integer type, the original int / int division truncated the ratio.
			double docCount = Convert.ToDouble(new Njnu.DAL.DocLib().GetDocCount());

			// Count the WordLib rows per word in a single pass instead of rescanning the
			// whole table for every test row.
			System.Collections.Hashtable rowsPerWord = new System.Collections.Hashtable();
			foreach (DataRow countedRow in dt_word.Rows)
			{
				string countedWord = countedRow["word"].ToString();
				object seen = rowsPerWord[countedWord];
				rowsPerWord[countedWord] = (seen == null) ? 1 : (int)seen + 1;
			}

			foreach (DataRow row in dt_testWord.Rows)
			{
				string word = row["word"].ToString();

				try
				{
					// A test word missing from WordLib has df = 0; the "+ 1" in the formula
					// keeps the divisor non-zero in that case.
					object counted = rowsPerWord[word];
					int df = (counted == null) ? 0 : (int)counted;
					int termFrequence = Convert.ToInt32(row["termFrequence"]);
					double weightValue = termFrequence * Math.Log(docCount / (df + 1) + 0.01);

					// Insert into table TfidfLib_test. The weight is formatted with the
					// invariant culture so a decimal-comma locale cannot corrupt the SQL
					// text, and single quotes in text values are doubled.
					string str = "insert TfidfLib_test(word, cateID, docID, weight,termFrequence, docFrequence) values('" + word.Replace("'", "''") + "','" + row["cateID"].ToString().Replace("'", "''") + "','"+ row["docID"].ToString().Replace("'", "''") + "'," + weightValue.ToString(System.Globalization.CultureInfo.InvariantCulture) + "," + termFrequence + "," + df + ")";
					SqlParameter[] parameters = {};

					using (SqlDatabase sqlDB = new SqlDatabase(str, parameters))
					{
						sqlDB.Run();
					}
				}
				catch (Exception e)
				{
					// Keep the original exception so the stack trace and type are not lost.
					throw new Exception("单词("+ word +")发生错误!" + e.Message, e);
				}
			}
		}


		/// <summary>
		/// Runs "select * from <paramref name="tableName"/>" and returns the result.
		/// </summary>
		/// <param name="tableName">Name of the table to read; appended to the query text unchanged.</param>
		/// <returns>The DataTable that was passed to SqlDatabase.Run for this query.</returns>
		private DataTable GetWordLib(string tableName)
		{
			string query = "select * from " + tableName;
			DataTable result = new DataTable();

			// The query takes no parameters, so an empty array is supplied.
			using (SqlDatabase db = new SqlDatabase(query, new SqlParameter[] {}))
			{
				db.Run(result);
			}

			return result;
		}
		
		
		#endregion


		#region 计算每类的特征项

		/// <summary>
		/// Iterates over every category returned by Category.GetAllCate() and, for each
		/// one, calls CreateTempTermLibByCateID with the category's ID and the document
		/// count reported by DocLib.GetDocCountByCateID for that ID.
		/// </summary>
		public void CreateTempTermLib()
		{
			DataTable categories = new Njnu.DAL.Category().GetAllCate();

			for (int index = 0; index < categories.Rows.Count; index++)
			{
				// Merge the feature terms of this category.
				string categoryId = categories.Rows[index]["ID"].ToString();
				int docsInCategory = new Njnu.DAL.DocLib().GetDocCountByCateID(categoryId);
				CreateTempTermLibByCateID(categoryId, docsInCategory);
			}
		}

		// Within table TermLib_temp, a word appears at most once per category.
		/// <summary>
		/// For one category, merges the TfidfLib rows of each distinct word into a single
		/// TermLib_temp row: docFrequence is the number of TfidfLib rows for that word and
		/// weight is the sum of their weights divided by <paramref name="cateCount"/>.
		/// </summary>
		/// <param name="cateID">Category ID passed to the TfidfLib queries.</param>
		/// <param name="cateCount">Divisor applied to each word's summed weight (the caller passes the category's document count).</param>
		private void CreateTempTermLibByCateID(string cateID, int cateCount)
		{
			// Distinct words of the category (duplicates merged into one row each).
			DataTable dt = new TfidfLib().GetDistinctTfidfLib(cateID, "TfidfLib");
			// All TfidfLib rows of the category.
			DataTable dt_tfidf = new TfidfLib().GetTfidfLib(cateID, "TfidfLib");

			// Aggregate row count and weight sum per word in one pass instead of
			// rescanning dt_tfidf for every distinct word. Weights are added in table
			// order, so the floating-point sums match a per-word scan exactly.
			System.Collections.Hashtable rowsPerWord = new System.Collections.Hashtable();
			System.Collections.Hashtable weightSumPerWord = new System.Collections.Hashtable();
			foreach (DataRow tfidfRow in dt_tfidf.Rows)
			{
				string tfidfWord = tfidfRow["word"].ToString();
				double rowWeight = Convert.ToDouble(tfidfRow["weight"]);

				object seenRows = rowsPerWord[tfidfWord];
				int previousRows = (seenRows == null) ? 0 : (int)seenRows;
				double previousSum = (seenRows == null) ? 0 : (double)weightSumPerWord[tfidfWord];

				rowsPerWord[tfidfWord] = previousRows + 1;
				weightSumPerWord[tfidfWord] = previousSum + rowWeight;
			}

			foreach (DataRow row in dt.Rows)
			{
				string word1 = row["word"].ToString();

				// A word with no TfidfLib rows keeps df = 0 and weight = 0.
				int df = 0;
				double weight = 0;
				object counted = rowsPerWord[word1];
				if (counted != null)
				{
					df = (int)counted;
					weight = (double)weightSumPerWord[word1];
				}

				double averageWeight = weight / cateCount;

				// Insert into table TermLib_temp. The weight is formatted with the
				// invariant culture so a decimal-comma locale cannot corrupt the SQL text,
				// and single quotes in text values are doubled.
				string str = "insert TermLib_temp(word, cateID, weight, docFrequence) values('" + word1.Replace("'", "''") + "','" + row["cateID"].ToString().Replace("'", "''") + "'," + averageWeight.ToString(System.Globalization.CultureInfo.InvariantCulture) + "," + df + ")";
				SqlParameter[] parameters = {};

				using (SqlDatabase sqlDB = new SqlDatabase(str, parameters))
				{
					sqlDB.Run();
				}
			}
		}


		//下面进行归一化处理
		public void CreateTermLib()
		{
			DataTable dt_cate = new DataTable();
			dt_cate = new Njnu.DAL.Category().GetAllCate();

			foreach(DataRow row in dt_cate.Rows)  //对每一类进行特征项归并
			{				
				DataTable dt = new DataTable();
				dt = GetTempTermLib( row["ID"].ToString() );	//取得每一类特征库

				double downValue = 0;  //代表分母值
				double upValue = 0;		 //代表分子值

				foreach(DataRow row1 in dt.Rows)
				{						
					double weightValue = Convert.ToDouble( row1["Weight"] );				
					downValue = downValue + weightValue * weightValue;			//计算分母值	
				}						

				foreach(DataRow row2 in dt.Rows)
				{				
					upValue = Convert.ToDouble( row2["Weight"] );
					double weightValue = upValue / ( Math.Sqrt(downValue) );					

					//插入表TermLib
					string str = "insert TermLib(word, cateID, weight, docFrequence) values('" + row2["word"].ToString()  + "','" + row2["cateID"].ToString() + "',"  + weightValue + "," +   Convert.ToInt32( row2["docFrequence"] ) + ")";							
					SqlParameter[] parameters = {};

					using(SqlDatabase sqlDB = new SqlDatabase(str, parameters))
					{
						sqlDB.Run();
					}	
				}//foreach

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -