⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 htmlcrawler.cs

📁 用C#编写的一款搜索引擎的源代码!摘自《Visual C# 2005 程序设计》
💻 CS
字号:
using System;
using System.IO;
using System.Threading;
using SECompare.Config;
using SECompare.SearchAPI;
using SECompare.Structure;

namespace SECompare.Kernel
{
	/// <summary>
	/// Background worker that downloads and stores the HTML of result pages.
	/// For every file found under the sampled-query root it loads the queries,
	/// then for each query, each rank from 1 to <see cref="MaxRank"/> and each
	/// selected engine it tries to fetch the page referenced by the stored record.
	/// The progress/status fields written here (mFilePercentage, mEntryPercentage,
	/// mProcessingFile, mProcessingEntry, mIsFinished) and mEngineSetting are not
	/// declared in this class; presumably they are inherited from ThreadStatus.
	/// </summary>
	public class HTMLCrawler:ThreadStatus
	{
		/// <summary>
		/// Highest rank (inclusive) probed for every query/engine pair.
		/// </summary>
		private const int MaxRank = 1000;

		/// <summary>
		/// Registers the parameterless <see cref="Crawl()"/> as the thread entry point.
		/// </summary>
		public HTMLCrawler()
		{
			this.threadStart = new ThreadStart(Crawl);
		}

		/// <summary>
		/// Thread entry point: crawls every sampled-query file in turn and
		/// always flags the worker as finished when it leaves.
		/// </summary>
		private void Crawl()
		{
			try
			{
				//Get all sampled query list files
				String[] files = Directory.GetFiles((new Config.Config()).SamplingedQueryRoot);
				for(int i=0;i<files.Length;i++)
				{
					//Progress report: fraction of files completed before this one
					this.mFilePercentage = (float)i/files.Length;
					this.Crawl(files[i]);
				}

				//Every file has been processed; without this the reported file
				//progress would stop at (n-1)/n and never show completion.
				this.mFilePercentage = 1f;
			}
			catch(ThreadInterruptedException)
			{
				//Deliberately empty: an interrupt simply ends the crawl early
				//(presumably issued by the owner to stop the worker).
				//The finally block still marks the worker as finished.
			}
			finally
			{
				this.mIsFinished = true;
			}
		}

		/// <summary>
		/// Crawl a sample query file.
		/// The file name (without directory) is used as the category name.
		/// </summary>
		/// <param name="filename">Path of the sample query file</param>
		private void Crawl(String filename)
		{
			FileInfo file = new FileInfo(filename);

			//Read the query samples stored in the file
			String category = file.Name;
			CrawlSample sample = new CrawlSample();
			sample.Load(category,filename);

			//Query strings to process and the engines selected for crawling
			String[] queries = sample.GetQueries();
			String[] engines = this.mEngineSetting.SelectedEngines;

			for(int i=0;i<queries.Length;i++)
			{
				String query = queries[i];

				//Progress report: current file and position within it
				this.mProcessingFile  = category;
				this.mEntryPercentage = (float)(i+1)/queries.Length;

				for(int rank=1;rank<=MaxRank;rank++)
				{
					//Progress report: entry currently being crawled
					this.mProcessingEntry = query + "(Rank:" + rank +")";

					//Crawl the page at this rank for each selected engine
					foreach(String engine in engines)
					{
						this.CrawlPage(query,engine,rank);
					}
				}
			}
		}

		/// <summary>
		/// Downloads and saves the HTML for a single query/engine/rank record.
		/// </summary>
		/// <param name="Query">Query string</param>
		/// <param name="Engine">Engine name</param>
		/// <param name="Rank">Rank of the result within the engine's result list</param>
		/// <returns>
		/// true when the page was downloaded and saved; false when no record
		/// exists, the HTML was already stored, the download reported failure,
		/// or a WebException occurred (which is logged).
		/// </returns>
		private bool CrawlPage(String Query,String Engine,int Rank)
		{
			try
			{
				QueryRecord record = QueryRecord.Load(Query,Engine,Rank);

				//Nothing to crawl when the record does not exist
				if(record==null)
				{
					return false;
				}

				//Skip pages whose HTML has already been crawled
				if(QueryRecord.ExistsHTML(Query,Engine,Rank))
				{
					return false;
				}

				String HTML=null;
				if(!SinglePage.SinglePage.Download(record.URL,ref HTML))
				{
					return false;
				}

				record.SaveHTML(HTML);
				Config.Log.Append("HTML Saved:" + record.URL);
				return true;
			}
			catch(System.Net.WebException ex)
			{
				//A failed download must not abort the whole crawl: log and move on
				Config.Log.Append("HTML DownLoad Error:"+ex.Message);
				return false;
			}
		}

	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -