documentworker.cs

来自「小型搜索软件的源代码」· CS 代码 · 共 557 行
557 行
using System;
using System.Net;
using System.IO;
using System.Threading;
using System.Text;
using ShootSearch.IFilter;
using ShootSearch.Plugin;
using ShootSearch.Indexing;
using ShootSearch.Logging;
using System.Security.Cryptography;
using ShootSearch.Helper;
using System.Text.RegularExpressions;

namespace ShootSearch.Spiders.http
{
	/// <summary>
	/// Performs all of the work of a single thread for the spider.
	/// This involves waiting for a URL to become available, downloading
	/// the page, and then processing it.
	/// </summary>
	// Carries out the operations that must run on a single worker thread:
	// waiting for an available URL, then downloading and processing the page.
	public class DocumentWorker
	{
		/// <summary>
		/// The base URI that is to be spidered.
		/// </summary>
		// The URI to scan. Process() reassigns this field for every unit of
		// work it obtains from the spider, so it holds the URI currently being handled.
		private Uri m_uri;

		/// <summary>
		/// The spider that this thread "works for"
		/// </summary>
		// The spider that owns this worker; assigned once in the constructor.
		private Spider m_spider;

		/// <summary>
		/// The thread that is being used.
		/// </summary>
		// Created by start(); remains null until start() has been called.
		private Thread m_thread;

		/// <summary>
		/// The thread number, used to identify this worker.
		/// </summary>
		// Thread number used to identify the current worker thread (exposed through Number).
		private int m_number;

		/// <summary>
		/// The name for default documents.
		/// </summary>
		// File name used when a URL does not name a document itself.
		public const string IndexFile = "index.html";
	
		/// <summary>
		/// Creates a worker that is bound to the given spider.
		/// </summary>
		/// <param name="spider">
		/// The spider that owns this worker; it is stored as-is and used for
		/// every later operation of the worker.
		/// </param>
		public DocumentWorker(Spider spider)
		{
			this.m_spider = spider;
		}


		// Text encoding selected by GetPage() for the page it downloaded last;
		// SaveTextFile() passes it to the StreamWriter that stores the page.
		private Encoding encoding;

		

		#region Convert uri to filename
		/// <summary>
		/// Takes a URI, such as /images/blank.gif, and converts it into the
		/// name of a file for local storage below the spider's output path.
		/// The directory that will hold the file is created if it does not
		/// exist yet, including the per-host directory for root-level pages.
		/// </summary>
		/// <remarks>
		/// The resulting layout is <c>OutputPath\host_with_underscores\dir\...\name</c>.
		/// A path without a document name (for example "/docs/" or "/docs")
		/// is stored under <see cref="IndexFile"/>.
		/// </remarks>
		/// <param name="uri">The URI of the file about to be stored.</param>
		/// <param name="p_Encode">
		/// When true, the file name is replaced by the formatted Base64 SHA-1
		/// hash of the full local name and gets the extension ".htm".
		/// When false, the original file name followed by the formatted
		/// query string is used.
		/// </param>
		/// <returns>The full local path for the document.</returns>
		private string convertFilename(Uri uri , bool p_Encode)
		{
			string result = m_spider.OutputPath + "\\" + uri.Host.Replace(".","_");

			// Dynamic pages: work on the URL-decoded address and keep the query
			// so that different queries end up in different files.
			uri = new Uri(System.Web.HttpUtility.UrlDecode(uri.ToString()));
			// Tools.formatFilename presumably makes the text usable inside a
			// file name - confirm against its implementation.
			string fileparameter = Tools.formatFilename(uri.Query);

			// add ending slash if needed
			if( result[result.Length-1]!='\\' )
				result += "\\";

			// strip the query if needed
			string path = uri.PathAndQuery;
			int queryIndex = path.IndexOf('?');
			if( queryIndex!=-1 )
				path = path.Substring(0,queryIndex);
			if( path.Length==0 )
				path = "/";

			// A last segment without a dot is treated as a directory whose
			// trailing slash is missing, so the default document is appended.
			if( path[path.Length-1]!='/' && path.LastIndexOf('/')>path.LastIndexOf('.') )
				path += "/" + IndexFile;

			// determine actual filename
			string filename = "";
			int lastSlash = path.LastIndexOf('/');
			if( lastSlash!=-1 )
			{
				filename = path.Substring(lastSlash+1);
				path = path.Substring(0,lastSlash+1);
				if( filename.Length==0 )
					filename = IndexFile;
			}

			// Append every directory segment between the leading slash and the
			// last slash to the local path.
			int segmentStart = 1;
			while( segmentStart<=path.Length )
			{
				int segmentEnd = path.IndexOf('/',segmentStart);
				if( segmentEnd==-1 )
					break;

				result += path.Substring(segmentStart,segmentEnd-segmentStart) + "\\";
				segmentStart = segmentEnd+1;
			}

			// Create the whole chain in one call. This also covers documents in
			// the site root, for which there are no sub-directory segments and
			// the host directory would otherwise never be created.
			Directory.CreateDirectory(result);

			// attach name
			string newName = result + filename + fileparameter;
			if( !p_Encode )
				return newName;

			// Only hash when the hashed name is actually requested, and release
			// the crypto provider as soon as the digest is available.
			byte[] digest;
			using( SHA1 sha = new SHA1CryptoServiceProvider() )
			{
				digest = sha.ComputeHash(Encoding.ASCII.GetBytes(newName));
			}

			return result + Tools.formatFilename(Convert.ToBase64String(digest)) + ".htm";
		}


		#endregion

		#region Save a binary file to disk.
		/// <summary>
		/// Saves the body of a response to disk as a binary file and registers
		/// the stored file with the index.
		/// </summary>
		/// <remarks>
		/// Nothing is written when the spider has no output path, or when the
		/// extension of the target file is not contained in
		/// <c>Config.FileTypes</c> (compared in upper case).
		/// </remarks>
		/// <param name="response">The response used to save the file.</param>
		private void SaveBinaryFile(WebResponse response)
		{
			if( m_spider.OutputPath==null )
				return;

			string filename = convertFilename( response.ResponseUri ,false);

			// A name without any dot has no extension and therefore cannot be
			// one of the configured file types.
			int dot = filename.LastIndexOf('.');
			if( dot==-1 )
				return;

			string fileExtension = filename.Substring(dot);
			if( !Config.FileTypes.Contains(fileExtension.ToUpper()) )
				return;

			byte[] buffer = new byte[1024];

			// The using blocks close both streams even when a read or write fails.
			using( Stream inStream = response.GetResponseStream() )
			using( Stream outStream = File.Create( filename ) )
			{
				int count;
				while( (count = inStream.Read(buffer,0,buffer.Length))>0 )
					outStream.Write(buffer,0,count);
			}

			IndexReptile.Add(m_uri,filename,Tools.GetUrlName(m_uri));
		}

		#endregion

		#region Save a text file.
		/// <summary>
		/// Saves a downloaded text document to disk and registers it with the
		/// index, using the title found in the text.
		/// </summary>
		/// <remarks>
		/// Addresses ending in ".css" or ".xml" keep their own file name; every
		/// other document is stored under a hashed ".htm" name. Saving is
		/// best effort: a failure is logged and does not stop the worker.
		/// </remarks>
		/// <param name="buffer">The text to save.</param>
		private void SaveTextFile(string buffer)
		{
			try
			{
				if( m_spider.OutputPath==null )
					return;

				string address = m_uri.ToString();
				string suffix = address.Length>=4
					? address.Substring(address.Length-4).ToLower()
					: "";
				bool keepOriginalName = suffix == ".css" || suffix == ".xml";
				string filename = convertFilename( m_uri , !keepOriginalName );

				// Fall back to the system default when no encoding was chosen yet.
				Encoding target = encoding!=null ? encoding : Encoding.Default;

				using( StreamWriter outStream = new StreamWriter( filename ,false ,target ) )
				{
					outStream.Write(buffer);
				}

				IndexReptile.Add(m_uri,filename,getTitle(buffer));
			}
			catch(ThreadAbortException)
			{
				// The worker is being stopped; this is not a save failure.
				throw;
			}
			catch(Exception e)
			{
				// Keep going with the next document, but leave a trace of what
				// went wrong instead of dropping the error silently.
				Log.ResultURL("Save failed for " + m_uri + ", error: " + e.Message);
			}
		}
		#endregion

		#region Download a page

		/// <summary>
		/// Downloads the document at the current URI.
		/// </summary>
		/// <remarks>
		/// Responses whose content type does not start with "text/" are handed
		/// to <c>SaveBinaryFile</c>. Text responses are read from the network
		/// exactly once; the bytes are first decoded with the system default
		/// encoding so that <c>Tools.GetEncoding</c> can inspect the text, and
		/// decoded again from the same bytes when UTF-8, UTF-7 or UNICODE is
		/// reported. The chosen encoding is kept in the <c>encoding</c> field.
		/// </remarks>
		/// <returns>
		/// The text of the page, or null when the document was binary or the
		/// download failed with a WebException or IOException (which is logged).
		/// </returns>
		private string GetPage()
		{
			WebResponse response = null;
			Stream stream = null;

			try
			{
				HttpWebRequest request = (HttpWebRequest)WebRequest.Create(m_uri);
				request.Timeout = 60000;
				response = request.GetResponse();
				stream = response.GetResponseStream();

				if( !response.ContentType.ToLower().StartsWith("text/") )
				{
					SaveBinaryFile(response);
					return null;
				}

				// Keep the raw bytes so the text can be decoded a second time
				// without issuing another request.
				byte[] raw = readAllBytes(stream);

				// First pass: system default encoding, every line terminated by CR LF.
				StringBuilder text = new StringBuilder();
				using( StreamReader reader = new StreamReader(new MemoryStream(raw),System.Text.Encoding.Default) )
				{
					string line;
					while( (line = reader.ReadLine())!=null )
						text.Append(line).Append("\r\n");
				}
				string buffer = text.ToString();

				string strEncoding = Tools.GetEncoding(buffer);
				if( strEncoding == "UTF-8" )
					encoding = Encoding.UTF8;
				else if( strEncoding == "UTF-7" )
					encoding = Encoding.UTF7;
				else if( strEncoding == "UNICODE" )
					encoding = Encoding.Unicode;
				else
					encoding = Encoding.Default;

				// Second pass: the page declared another encoding, so decode the
				// bytes that were already downloaded with that encoding instead.
				if( encoding != Encoding.Default )
				{
					using( StreamReader reader = new StreamReader(new MemoryStream(raw),encoding) )
					{
						buffer = reader.ReadToEnd();
					}
				}

				return buffer;
			}
			catch(WebException e)
			{
				Log.ResultURL("下载失败，错误：" + e);
				return null;
			}
			catch(IOException e)
			{
				Log.ResultURL("下载失败，错误：" + e);
				return null;
			}
			finally
			{
				if( stream!=null ) stream.Close();
				if( response!=null ) response.Close();
			}
		}

		/// <summary>
		/// Reads a stream to its end and returns everything that was read.
		/// </summary>
		/// <param name="source">The stream to read; it is not closed here.</param>
		/// <returns>All bytes from the current position to the end of the stream.</returns>
		private static byte[] readAllBytes(Stream source)
		{
			MemoryStream copy = new MemoryStream();
			byte[] chunk = new byte[4096];
			int count;
			while( (count = source.Read(chunk,0,chunk.Length))>0 )
				copy.Write(chunk,0,count);
			return copy.ToArray();
		}
		#endregion

		#region ProcessLink
		/// <summary>
		/// Handles one link found in a page: resolves it against the current
		/// URI and, when it qualifies, hands it to the spider for later
		/// spidering.
		/// </summary>
		/// <remarks>
		/// A link is handed to the spider only if it resolves to an http or
		/// https address, passes the host check, and its absolute form
		/// contains the spider's base URI. Whether it was visited before is
		/// left to the spider.
		/// </remarks>
		/// <param name="link">The URL to process, as written in the page.</param>
		/// <returns>
		/// The absolute URL when the link resolved to an http/https address
		/// that passed the host check; otherwise the link text as it stands
		/// when processing stopped (for a rejected link this has the fragment
		/// stripped and the ampersand entity replaced).
		/// </returns>
		private string ProcessLink(string link)
		{
			Log.Debug(link);

			if( link == "#" )
				return link;

			// Cut off fragments: everything from a '#' that follows the last '/'.
			int hashPos = link.LastIndexOf("#");
			while( hashPos > link.LastIndexOf("/") )
			{
				link = link.Substring(0,hashPos);
				hashPos = link.LastIndexOf("#");
			}

			// Undo the HTML escaping of ampersands inside the address.
			link = link.Replace("&amp;","&");

			// fully expand this URL if it was a relative link
			Uri url;
			try
			{
				url = new Uri(m_uri,link,true);
			}
			catch(UriFormatException e)
			{
				Log.ResultURL( "Invalid URI:" + link +" Error:" + e.Message);
				return link;
			}

			string scheme = url.Scheme.ToLower();
			if( scheme != "http" && scheme != "https" )
				return link;

			// Host restriction: links to another host are rejected while
			// Config.OnlyStartURL is false.
			// NOTE(review): going by its name the flag looks inverted here - confirm the intent.
			if( !Config.OnlyStartURL && url.Host.ToLower() != m_uri.Host.ToLower() )
				return link;

			// Queue only addresses that contain the spider's base URI;
			// everything else is merely written to the debug log.
			string absolute = url.ToString();
			if( absolute.IndexOf(this.m_spider.BaseURI.ToString()) >= 0 )
				m_spider.addURI( url );
			else
				Log.Debug(url);

			return absolute;
		}

		#endregion

		#region ProcessPage
		/// <summary>
		/// Scans a page for HREF and SRC attributes, passes every link to
		/// <c>ProcessLink</c> and rewrites the page so that it contains the
		/// URLs returned by it.
		/// </summary>
		/// <param name="page">The text of the page to process.</param>
		/// <returns>The page text with the link values replaced.</returns>
		private string ProcessPage(string page)
		{
			// The parser works on the text it was given here; the rewritten
			// copy in 'page' is not fed back into it.
			ParseHTML parse = new ParseHTML();
			parse.Source = page;

			while(!parse.Eof())
			{
				char ch = parse.Parse();

				// Only a result of 0 is followed by a tag lookup; other
				// characters are skipped.
				if( ch!=0 )
					continue;

				page = rewriteLinkAttribute(parse.GetTag()["HREF"], page);
				page = rewriteLinkAttribute(parse.GetTag()["SRC"], page);
			}
			return page;
		}

		/// <summary>
		/// Processes the link held by one attribute and replaces its value
		/// throughout the page text.
		/// </summary>
		/// <param name="a">The attribute to handle; null when the tag does not have it.</param>
		/// <param name="page">The current page text.</param>
		/// <returns>The page text after the replacement (unchanged when there is nothing to do).</returns>
		private string rewriteLinkAttribute(Attribute a, string page)
		{
			if( a==null )
				return page;

			// An empty value must be skipped: the unquoted replacement below
			// would otherwise search for a bare "=" and rewrite every
			// attribute assignment in the page.
			string oldValue = a.Value;
			if( oldValue==null || oldValue.Length==0 )
				return page;

			string newValue = ProcessLink(oldValue);
			if( newValue == oldValue )
				return page;

			// Cover the three ways the value can be written: double-quoted,
			// single-quoted and unquoted.
			page = page.Replace( "\"" + oldValue + "\"" , "\"" + newValue + "\"" );
			page = page.Replace( "'" + oldValue + "'" , "'" + newValue + "'" );
			page = page.Replace( "=" + oldValue , "=" + newValue );
			return page;
		}

		#endregion

		#region Process()
		/// <summary>
		/// This method is the main loop for the spider threads.
		/// It obtains URLs from the spider one at a time, downloads each of
		/// them, processes the links of text pages and saves the result.
		/// The loop ends, and the method returns, once the spider's Quit flag is set.
		/// </summary>
		public void Process()
		{
			while(!m_spider.Quit )
			{
				m_uri = m_spider.ObtainWork();

				m_spider.SpiderDone.WorkerBegin();
				try
				{
					Log.ResultURL("Download("+this.Number+"):"+m_uri);
					string page = GetPage();
					if(page!=null)
					{
						page = ProcessPage(page);
						SaveTextFile(page);
					}
				}
				finally
				{
					// Always pair WorkerBegin with WorkerEnd, even when the
					// work item fails or the thread is aborted in the middle.
					m_spider.SpiderDone.WorkerEnd();
				}
			}

			// Quit was requested: returning ends the thread, so there is no
			// need for the thread to abort itself.
		}

		#endregion

		#region Start the thread.
		/// <summary>
		/// Creates the worker thread, names it "Shoot-Spider" and starts it
		/// running <see cref="Process"/>.
		/// </summary>
		public void start()
		{
			m_thread = new Thread(new ThreadStart( this.Process ));
			// Name the thread before it runs so it is never seen unnamed.
			m_thread.Name = "Shoot-Spider";
			m_thread.Start();
		}
		#endregion

		#region The thread number. Used only to identify this thread.
		/// <summary>
		/// Gets or sets the number of this worker. The value has no effect on
		/// the work itself; it only identifies the worker, for example in log output.
		/// </summary>
		public int Number
		{
			get { return this.m_number; }
			set { this.m_number = value; }
		}
		#endregion

		#region Finds a title of HTML file.
		/// <summary>
		/// Finds the title of an HTML document: the text of the first
		/// title element, matched case-insensitively and allowed to span
		/// several lines. Surrounding white space is removed.
		/// </summary>
		/// <param name="html">HTML document source.</param>
		/// <returns>
		/// The title string, or "(unknown)" when the source is null or
		/// contains no title element.
		/// </returns>
		private string getTitle(string html)
		{
			if( html==null )
				return "(unknown)";

			// Lazy capture stops at the first closing tag; Singleline lets the
			// dot match line breaks inside the title.
			Match m = Regex.Match(
				html,
				"<title(?:\\s[^>]*)?>(.*?)</title>",
				RegexOptions.IgnoreCase | RegexOptions.Singleline);

			// Groups.Count depends on the pattern only, so Success is what
			// tells whether a title was actually found.
			if( m.Success )
				return m.Groups[1].Value.Trim();

			return "(unknown)";
		}
		#endregion

		#region StopWork
		/// <summary>
		/// Stops the worker by aborting its thread. Does nothing when the
		/// worker was never started. Stopping is best effort: a failure to
		/// abort is written to the debug log and not passed on to the caller.
		/// </summary>
		public void StopWork()
		{
			Thread worker = this.m_thread;
			if( worker==null )
				return;

			try
			{
				worker.Abort();
			}
			catch(Exception e)
			{
				Log.Debug("StopWork: could not abort worker " + m_number + ": " + e.Message);
			}
		}
		#endregion

	}
}
documentworker.cs - 源码说明

本页面展示了「小型搜索软件的源代码」中的 documentworker.cs 源码文件，采用 CS 编程语言编写，共 557 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与搜索相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?