spider.cs

来自「小型搜索软件的源代码」· CS 代码 · 共 300 行

300 行

using System;
using System.Collections;
using System.Net;
using System.IO;
using System.Threading;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using ShootSearch.Logging;

namespace ShootSearch.Spiders.http
{
	/// <summary>
	/// The main class for the spider. This spider can be used with the 
	/// SpiderForm form that has been provided. The spider is completely 
	/// self-contained. If you would like to use the spider with your own
	/// application just remove the references to m_spiderForm from this file.
	/// 
	/// The files needed for the spider are:
	/// 
	/// Attribute.cs - Used by the HTML parser
	/// AttributeList.cs - Used by the HTML parser
	/// DocumentWorker - Used to "thread" the spider
	/// Done.cs - Allows the spider to know when it is done
	/// Parse.cs - Used by the HTML parser
	/// ParseHTML.cs - The HTML parser
	/// Spider.cs - This file
	/// SpiderForm.cs - Demo of how to use the spider
	/// 
	/// This spider is copyright 2003 by Jeff Heaton. However, it is
	/// released under a Limited GNU Public License (LGPL). You may 
	/// use it freely in your own programs. For the latest version visit
	/// http://www.jeffheaton.com.
	///
	/// </summary>
	public class Spider : SpiderMan
	{
		/// <summary>
		/// The URLs that have already been queued, keyed by the trimmed
		/// string form of the URI. The value is a <see cref="Status"/>.
		/// </summary>
		private Hashtable m_already;

		/// <summary>
		/// URLs that are waiting to be handed out by <see cref="ObtainWork"/>.
		/// </summary>
		private Queue m_workload;

		/// <summary>
		/// The first URL to spider. All other URLs must have the
		/// same hostname as this URL. 
		/// </summary>
		private Uri m_base;

		/// <summary>
		/// The directory to save the spider output to.
		/// </summary>
		private string m_outputPath;

		/// <summary>
		/// How many URLs have been handed out to workers since the
		/// last call to <see cref="reset"/>.
		/// </summary>
		private int m_urlCount = 0;

		/// <summary>
		/// When the spider started working, in <see cref="DateTime.Ticks"/>.
		/// </summary>
		private long m_startTime = 0;

		/// <summary>
		/// Used to keep track of when the spider might be done.
		/// </summary>
		private Done m_done = new Done();

		/// <summary>
		/// Used to tell the spider to quit. Declared volatile because
		/// <see cref="Start"/> sets it and then waits on m_done, so
		/// <see cref="StopWork"/> has to set it from a different thread.
		/// </summary>
		private volatile bool m_quit;

		/// <summary>
		/// The status for each URL that was processed. Only STATUS_QUEUED
		/// is stored by this class.
		/// </summary>
		enum Status { STATUS_FAILED, STATUS_SUCCESS, STATUS_QUEUED };

		/// <summary>
		/// The path for Index files.
		/// </summary>
		private string m_indexpath;

		/// <summary>
		/// The workers created by the most recent call to <see cref="Start"/>;
		/// null until Start has been called.
		/// </summary>
		private ArrayList arrThreads;

		/// <summary>
		/// Creates a spider with an empty workload.
		/// </summary>
		public Spider()
		{
			reset();
		}

		/// <summary>
		/// Call to reset from a previous run of the spider. Clears the set of
		/// known URLs, the pending workload, the processed counter and the
		/// quit flag.
		/// </summary>
		public void reset()
		{
			m_already = new Hashtable();
			m_workload = new Queue();
			// m_startTime restarts on every Start, so the counter must restart
			// too or the count and urls/sec reported by ObtainWork are inflated.
			m_urlCount = 0;
			m_quit = false;
		}

		/// <summary>
		/// Add the specified URL to the list of URIs to spider.
		/// This is usually only used by the spider, itself, as
		/// new URLs are found. A URI that was added before is ignored.
		/// </summary>
		/// <param name="uri">The URI to add</param>
		public void addURI(Uri uri)
		{
			// Built before taking the lock, and used for both the lookup and
			// the insert so the two can never disagree.
			string key = uri.ToString().Trim();

			// lock releases the monitor even when an exception is thrown.
			// The lock object stays "this" because ObtainWork waits on it.
			lock(this)
			{
				if( !m_already.Contains(key) )
				{
					m_already.Add(key,Status.STATUS_QUEUED);
					m_workload.Enqueue(uri);

					// Wake one worker blocked in ObtainWork; there is
					// nothing to wake it for when no work was added.
					Monitor.Pulse(this);
				}
			}
		}

		/// <summary>
		/// The URI that is to be spidered
		/// </summary>
		public Uri BaseURI 
		{
			get { return m_base; }
			set { m_base = value; }
		}

		/// <summary>
		/// The local directory to save the spidered files to
		/// </summary>
		public string OutputPath
		{
			get { return m_outputPath; }
			set { m_outputPath = value; }
		}

		/// <summary>
		/// Set to true to request the spider to quit.
		/// </summary>
		public bool Quit
		{
			get { return m_quit; }
			set { m_quit = value; }
		}

		/// <summary>
		/// Used to determine if the spider is done, 
		/// this object is usually only used internally
		/// by the spider.
		/// </summary>
		public Done SpiderDone
		{
			get { return m_done; }
		}

		/// <summary>
		/// The path for Index files.
		/// </summary>
		public string IndexPath
		{
			get { return m_indexpath; }
			set { m_indexpath = value; }
		}

		/// <summary>
		/// Called by the worker threads to obtain a URL to process.
		/// Blocks while the workload is empty, then dequeues the next URL
		/// and reports progress through <c>Log</c>.
		/// </summary>
		/// <returns>The next URL to process.</returns>
		public Uri ObtainWork()
		{
			// lock (instead of bare Monitor.Enter/Exit) guarantees the monitor
			// is released if a Log call throws; otherwise a single exception
			// would leave every other worker blocked forever.
			lock(this)
			{
				// NOTE(review): this loop does not observe Quit, so a worker
				// waiting here is only released by addURI. Left as is because
				// returning without a URL would change what callers receive.
				while(m_workload.Count<1)
				{
					Monitor.Wait(this);
				}

				Uri next = (Uri)m_workload.Dequeue();

				Log.ProcessURL(next.ToString());

				// The displayed count is the value before this URL is counted.
				Log.m_Console.SetProcessedCount(""+(m_urlCount++));

				// Whole seconds since Start; zero is guarded to avoid dividing by it.
				long etime = (System.DateTime.Now.Ticks-m_startTime)/TimeSpan.TicksPerSecond;
				long urls = (etime==0)?0:m_urlCount/etime;
				Log.m_Console.SetElapsedTime( etime/60 + " minutes (" + urls +" urls/sec)" );

				return next;
			}
		}

		/// <summary>
		/// Start the spider: queue the base URI, start the workers, then
		/// wait on the <see cref="SpiderDone"/> object (WaitBegin followed
		/// by WaitDone).
		/// </summary>
		/// <param name="baseURI">The base URI to spider</param>
		/// <param name="threads">The number of worker threads to use;
		/// values below 1 are treated as 1.</param>
		public void Start(Uri baseURI,int threads)
		{
			// With no workers nothing would ever consume the workload.
			if(threads<1)
			{
				threads = 1;
			}

			// init the spider
			m_quit = false;

			m_base = baseURI;
			addURI(m_base);
			m_startTime = System.DateTime.Now.Ticks;
			m_done.Reset();
			arrThreads = new ArrayList();

			// startup the threads; workers are numbered 1..threads.
			// (The previous bound "i<threads" started one worker too few,
			// and none at all when threads was 1.)
			for(int i=1;i<=threads;i++)
			{
				DocumentWorker worker = new DocumentWorker(this);
				worker.Number = i;
				arrThreads.Add(worker);
				worker.start();
			}

			// now wait to be done
			m_done.WaitBegin();
			m_done.WaitDone();
		}

		/// <summary>
		/// Sets <see cref="OutputPath"/> to the configured cache directory
		/// and starts spidering from the given URL.
		/// </summary>
		/// <param name="p_baseUrl">The base URL to spider, as a string</param>
		/// <param name="threads">The number of worker threads to use</param>
		public void AcceptJob(string p_baseUrl,int threads)
		{
			OutputPath = ShootSearch.Helper.Config.CacheDirectory;

			Start(new Uri(p_baseUrl) , threads);
		}

		/// <summary>
		/// Requests the spider to quit and forwards the stop request to every
		/// worker created by the last <see cref="Start"/>. Safe to call before
		/// Start, in which case only the quit flag is set.
		/// </summary>
		public void StopWork()
		{
			this.Quit=true;

			// arrThreads is null until Start has run.
			ArrayList workers = arrThreads;
			if(workers==null)
			{
				return;
			}

			for(int i = 0 ; i< workers.Count ; i++)
			{
				((DocumentWorker)workers[i]).StopWork();
			}
		}

		/// <summary>
		/// Sets <see cref="IndexPath"/>.
		/// </summary>
		/// <param name="p_strPath">The path for Index files</param>
		public void setIndexPath(string p_strPath)
		{
			this.IndexPath = p_strPath;
		}

		/// <summary>
		/// Reports whether the spider has been asked to quit.
		/// </summary>
		/// <returns>The current value of <see cref="Quit"/>.</returns>
		public bool IsStoped()
		{
			return this.Quit;
		}

	}
}

spider.cs - 源码说明

本页面展示了「小型搜索软件的源代码」中的 spider.cs 源码文件，采用 CS 编程语言编写，共 300 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。

虫虫下载站收录了大量与搜索相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。

⌨️ 快捷键说明

复制代码Ctrl + C

搜索代码Ctrl + F

全屏模式F11

增大字号Ctrl + =

减小字号Ctrl + -

显示快捷键?