⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 crawlsample.cs

📁 用 C# 编写的一款搜索引擎的源代码!摘自《Visual C# 2005 程序设计》
💻 CS
字号:
using System;
using System.IO;
using System.Text;
using System.Collections;

namespace SECompare.Kernel
{
	/// <summary>
	/// List of query samples to crawl.
	/// </summary>
	public class CrawlSample
	{
		// Backing list of samples; the methods of this class cast its elements to CrawlSampleUnit when reading them.
		private ArrayList mSamples;
		// Category name passed to the most recent Load call (null until then); stored but never read inside this class.
		private String mCategory;

		/// <summary>
		/// Number of samples currently held in the list.
		/// </summary>
		public int Count
		{
			get { return mSamples.Count; }
		}

		/// <summary>
		/// Creates an empty sample list. No category is assigned until Load is called.
		/// </summary>
		public CrawlSample()
		{
			mSamples = new ArrayList();
		}

		/// <summary>
		/// Replaces the current samples with the query samples read from the given file
		/// and remembers the category they belong to.
		/// Each line holds tab-separated fields, the words first and an integer frequency last: 
		/// ******************************
		///   words         frequency
		/// ******************************
		///		cnn	8700552
		///		cnn.com	2554240
		///		cnn news	786853
		///		www.cnn.com	755488
		///		news	184851
		///	*******************************
		/// The frequency is the right-most field (other than the first) that parses as a 32-bit integer.
		/// If no field qualifies, the frequency is 0 and only the first field is used as the words.
		/// Fields between the first one and the frequency are appended to the words without any separator.
		/// Every line, including an empty one, produces one sample.
		/// </summary>
		/// <param name="category">Category name to store for this sample list.</param>
		/// <param name="filename">Path of the sample file to read.</param>
		public void Load(String category,String filename)
		{
			this.mCategory = category;
			//firstly, clear former data
			this.mSamples.Clear();
			//then, read data file line by line; the using block closes the reader
			//even when reading or parsing throws
			using(StreamReader reader = new StreamReader(filename))
			{
				String line;
				while((line = reader.ReadLine())!=null)
				{
					//parse the line to get a string(words) and an integer(frequency)
					String[] splits = line.Split(new char[]{'\t'});
					int frequency=0;
					int end;  //end of words, and start of frequency
					//scan from the right for the first field that is a valid integer;
					//field 0 is never tested because it always belongs to the words
					for(end=splits.Length-1;end>0;end--)
					{
						//TryParse reports malformed and out-of-range numbers as false
						//instead of throwing, and leaves frequency at 0 in that case
						if(int.TryParse(splits[end],out frequency)) break;
					}

					StringBuilder words = new StringBuilder(splits[0]);
					for(int i=1;i<end;i++)
					{
						words.Append(splits[i]);
					}

					this.mSamples.Add(new CrawlSampleUnit(words.ToString(),frequency));
				}
			}
		}


		/// <summary>
		/// Tells whether the sample list already holds a sample whose Words equal the given string.
		/// </summary>
		/// <param name="Words">The words string to look for.</param>
		/// <returns>true if a sample with equal Words is found, otherwise false.</returns>
		public bool ContainsWords(String Words)
		{
			foreach(CrawlSampleUnit unit in this.mSamples)
			{
				if(unit.Words.Equals(Words))
				{
					return true;
				}
			}
			return false;
		}

		/// <summary>
		/// Saves the query samples to a file, one sample per line, using each sample's ToString() text.
		/// The file is created if it does not exist and overwritten if it does.
		/// </summary>
		/// <param name="filename">File path</param>
		public void Save(String filename)
		{
			//StreamWriter with append=false creates a missing file and truncates an existing one,
			//so no separate existence check is needed; the using block closes the writer
			//(and flushes it) even if a write throws
			using(StreamWriter w = new StreamWriter(filename,false))
			{
				foreach(CrawlSampleUnit unit in this.mSamples)
				{
					w.WriteLine(unit.ToString());
				}
			}
		}

		/// <summary>
		/// Removes some samples so that roughly the given fraction of them remains.
		/// The target size is Count*percentage truncated to an integer and is passed to Prune(int),
		/// which keeps samples taken at a regular interval across the list.
		/// </summary>
		/// <param name="percentage">a float between 0 and 1 (inclusive); any other value, including NaN, does nothing</param>
		public void Prune(float percentage)
		{
			//written as a positive range test so that NaN, which fails every comparison,
			//is rejected too instead of being cast to an unspecified integer
			if(!(percentage>=0&&percentage<=1)) return;
			if(this.mSamples.Count==0) return;
			this.Prune((int)(this.mSamples.Count*percentage));
		}

		/// <summary>
		/// Removes some samples so that [sampleSize] samples remain.
		/// With interval = Count/sampleSize (integer division), the samples kept are those at
		/// positions 0, interval, 2*interval, ... up to sampleSize of them, so the survivors are
		/// spread over the whole list. The list is modified in place.
		/// Nothing is removed when the interval is 1 or less, i.e. when sampleSize is more than half of Count.
		/// </summary>
		/// <param name="sampleSize">an integer, which represents the sample size after pruning.
		/// If it is negative or larger than the original size, nothing is done; 0 removes every sample.</param>
		public void Prune(int sampleSize)
		{
			int originalCount = this.mSamples.Count;
			if(originalCount==0||originalCount<sampleSize) return;
			//a negative size never produced an interval above 1, so it stays a no-op
			if(sampleSize<0) return;
			if(sampleSize==0)
			{
				//dividing by the sample size below would throw DivideByZeroException;
				//a target size of zero simply means no sample remains
				this.mSamples.Clear();
				return;
			}

			int interval = originalCount/sampleSize;
			if(interval<=1) return;

			//pruning: move every kept sample to the front. The source position kept*interval
			//is always at or after the destination, and (sampleSize-1)*interval is below
			//originalCount, so no needed element is overwritten and no index is out of range
			for(int kept=1;kept<sampleSize;kept++)
			{
				this.mSamples[kept] = this.mSamples[kept*interval];
			}

			//drop everything behind the kept samples in one step
			this.mSamples.RemoveRange(sampleSize,originalCount-sampleSize);
		}

		/// <summary>
		/// Returns the internal sample list itself, not a copy, so changes made through it affect this object.
		/// </summary>
		/// <returns>The ArrayList holding the samples.</returns>
		public ArrayList GetSamples()
		{
			return mSamples;
		}

		/// <summary>
		/// Collects the Words of every sample, in list order, into a new array.
		/// </summary>
		/// <returns>An array with one entry per sample; empty when there are no samples.</returns>
		public String[] GetQueries()
		{
			int total = this.mSamples.Count;
			String[] queries = new String[total];
			for(int pos=0;pos<total;pos++)
			{
				queries[pos] = ((CrawlSampleUnit)this.mSamples[pos]).Words;
			}
			return queries;
		}

	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -