⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 mainform.cs

📁 利用VS C#实现的网络爬虫功能
💻 CS
📖 第 1 页 / 共 5 页
字号:
						if(thread.ThreadState == ThreadState.Suspended)
							thread.Resume();
						thread.Abort();
					}
				}
				catch(Exception)
				{
				}
			}
			Monitor.Exit(this.listViewThreads);
			this.toolBarButtonContinue.Enabled = true;
			this.toolBarButtonPause.Enabled = false;
			this.buttonGo.Enabled = true;

			this.queueURLS.Clear();
			this.urlStorage.Clear();
		}

		// Body of a crawler worker thread.
		// The thread's Name is parsed as an integer and used both as its index into
		// listViewThreads and to decide whether it is still within ThreadCount.
		// Loops while ThreadsRunning is true and the index is below ThreadCount, taking one
		// uri at a time from the queue and handing it to ParseUri. On exit it updates this
		// thread's row in listViewThreads.
		void ThreadRunFunction()
		{
			// passed by ref to ParseUri on every iteration, so ParseUri can replace or reset it
			MyWebRequest request = null;
			while(ThreadsRunning && int.Parse(Thread.CurrentThread.Name) < this.ThreadCount)
			{
				MyUri uri = DequeueUri();
				if(uri != null)
				{
					// optional delay (SleepConnectTime seconds) before each request
					if(SleepConnectTime > 0)
						Thread.Sleep(SleepConnectTime*1000);
					ParseUri(uri, ref request);
				}
				else
					// queue is empty: wait SleepFetchTime seconds before polling again
					Thread.Sleep(SleepFetchTime*1000);
			}

			// 'lock' releases the monitor from a finally block. The previous
			// Monitor.Enter/Monitor.Exit pair placed Exit after the catch, which is skipped when
			// the thread is aborted (ThreadAbortException is re-raised at the end of the catch),
			// leaving listViewThreads locked forever.
			lock(this.listViewThreads)
			{
				try
				{
					ListViewItem item = this.listViewThreads.Items[int.Parse(Thread.CurrentThread.Name)];
					// only mark the row as stopped when the crawl was stopped; a thread that
					// left the loop because of ThreadCount keeps its last status text
					if(ThreadsRunning == false)
						item.SubItems[2].Text = "Stop";
					item.ImageIndex = 0;
				}
				catch(Exception)
				{
					// best effort: the row may not exist (or the name may not be numeric);
					// a failed status update must not take the worker down
				}
			}
		}

		// Pushes a uri onto the shared work queue (queueURLS).
		// uri              - the uri to queue.
		// bCheckRepetition - when true, the uri is first passed to AddURL; if AddURL returns
		//                    false the uri is not queued.
		// Returns false only when AddURL rejected the uri; true otherwise. Note that true is
		// also returned when Enqueue itself failed, because that failure is swallowed.
		bool EnqueueUri(MyUri uri, bool bCheckRepetition)
		{
			// register the uri in the url storage to check if it is duplicated or not
			if(bCheckRepetition == true && AddURL(ref uri) == false)
				return false;

			// 'lock' guarantees the monitor is released even if this thread is aborted while
			// holding it; the previous Monitor.Exit after the catch block was skipped in that
			// case and left queueURLS locked for every other thread.
			lock(queueURLS)
			{
				try
				{
					// add the uri to the queue
					queueURLS.Enqueue(uri);
				}
				catch(Exception)
				{
					// best effort: a failed enqueue is ignored
				}
			}

			return true;
		}

		// Pops the next uri from the shared work queue (queueURLS).
		// Returns the dequeued uri, or null when the queue is empty or the dequeue failed.
		MyUri DequeueUri()
		{
			MyUri uri = null;
			// 'lock' guarantees the monitor is released even if this thread is aborted while
			// holding it; the previous Monitor.Exit after the catch block was skipped in that
			// case and left queueURLS locked for every other thread.
			lock(queueURLS)
			{
				try
				{
					// Test for emptiness instead of letting Dequeue throw: idle workers poll
					// this method continuously, and an exception per poll is needlessly costly.
					// NOTE(review): assumes queueURLS is a System.Collections.Queue (Count/Dequeue).
					if(queueURLS.Count > 0)
						uri = (MyUri)queueURLS.Dequeue();
				}
				catch(Exception)
				{
					// best effort: any failure is reported to the caller as "nothing to do"
				}
			}
			return uri;
		}

		// Starts a crawl from the text currently entered in comboBoxWeb.
		// An existing directory is handed to ParseFolder; anything else is queued as a uri.
		// Text that is not an existing file is first passed through Normalize and written
		// back to the combo box. On any exception the error is logged and the method returns
		// without touching the toolbar/button state.
		void RunParser()
		{
			ThreadsRunning = true;
			try
			{
				string target = this.comboBoxWeb.Text.Trim();
				if(!Directory.Exists(target))
				{
					// not a folder: either a local file (used as typed) or a web address
					if(!File.Exists(target))
					{
						Normalize(ref target);
						// show the user the address that will actually be crawled
						this.comboBoxWeb.Text = target;
					}
					// the start uri is queued without the duplicate check
					this.EnqueueUri(new MyUri(target), false);
				}
				else
				{
					ParseFolder(target, 0);
				}
			}
			catch(Exception ex)
			{
				LogError(this.comboBoxWeb.Text, ex.Message);
				return;
			}

			this.toolBarButtonContinue.Enabled = false;
			this.buttonGo.Enabled = true;
		}
		
		// Turns a user-typed web address into an absolute URL, in place.
		//  - "http://" is prepended unless the text already starts with "http://" or
		//    "https://" (compared ordinally, ignoring letter case). Previously an "https://"
		//    or "HTTP://" address was turned into "http://https://...".
		//  - A trailing '/' is appended when nothing but the host follows the scheme.
		// strURL - the address to normalize; must not be null.
		private void Normalize(ref string strURL)
		{
			bool bHasScheme = strURL.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
				|| strURL.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
			if(bHasScheme == false)
				strURL = "http://"+strURL;

			// Start the search right after "scheme://". The text is now guaranteed to contain
			// "://", so the start index never exceeds the string length (the old fixed index 8
			// threw ArgumentOutOfRangeException for inputs shorter than 8 characters).
			int nHostStart = strURL.IndexOf("://", StringComparison.Ordinal)+3;
			if(strURL.IndexOf('/', nHostStart) == -1)
				strURL += '/';
		}

		// Decides whether a uri should be crawled and records it in urlStorage.
		// Returns false when the uri's host contains one of the ExcludeHosts patterns (an
		// error is logged), when urlStorage.Add reports a Count other than 1, or when
		// recording fails. Returns true when urlStorage.Add reports a Count of exactly 1
		// (presumably meaning the url has not been seen before - confirm against urlStorage).
		bool AddURL(ref MyUri uri)
		{
			foreach(string str in ExcludeHosts)
			{
				// blank entries are ignored
				string strPattern = str.Trim();
				// NOTE(review): the host is lowercased but the pattern is used as typed, so a
				// pattern containing upper-case letters never matches - confirm this is intended.
				if(strPattern.Length > 0 && uri.Host.ToLower().IndexOf(strPattern) != -1)
				{
					LogError(uri.AbsoluteUri, "\r\nHost excluded as it includes reserved pattern ("+str+")");
					return false;
				}
			}

			bool bNew = false;
			// 'lock' guarantees the monitor is released even if this thread is aborted while
			// holding it; the previous Monitor.Exit after the catch block was skipped in that
			// case and left urlStorage locked for every other thread.
			lock(urlStorage)
			{
				try
				{
					string strURL = uri.AbsoluteUri;
					bNew = urlStorage.Add(ref strURL).Count == 1;
				}
				catch(Exception)
				{
					// best effort: a uri that cannot be recorded is reported as not new
				}
			}

			return bNew;
		}

		// Sets the text of one sub-item (column) of a list view row while holding the
		// listViewThreads lock.
		// itemLog - the row to update.
		// nCell   - index into itemLog.SubItems.
		// str     - the new cell text.
		// Failures (for example a null row or an out-of-range index) are ignored.
		void LogCell(ref ListViewItem itemLog, int nCell, string str)
		{
			// 'lock' guarantees the monitor is released even if this thread is aborted while
			// holding it; the previous Monitor.Exit after the catch block was skipped in that
			// case and left listViewThreads locked for every other thread.
			lock(this.listViewThreads)
			{
				try
				{
					itemLog.SubItems[nCell].Text = str;
				}
				catch(Exception)
				{
					// best effort: a failed status update is not worth failing the download
				}
			}
		}

		void ParseUri(MyUri uri, ref MyWebRequest request)
		{
			string strStatus = "";
			// check if connection is kept alive from previous connections or not
			if(request != null && request.response.KeepAlive)
				strStatus += "Connection live to: "+uri.Host+"\r\n\r\n";
			else
				strStatus += "Connecting: "+uri.Host+"\r\n\r\n";

			ListViewItem itemLog = null;
			Monitor.Enter(this.listViewThreads);
			try
			{	// update thread information in the threads view list
				itemLog = this.listViewThreads.Items[int.Parse(Thread.CurrentThread.Name)];
				int nDepth = uri.Depth;
				itemLog.SubItems[1].Text = nDepth.ToString();
				itemLog.ImageIndex = 1;
				itemLog.BackColor = Color.WhiteSmoke;
				// initialize status to Connect
				itemLog.SubItems[2].Text = "Connect";
				itemLog.ForeColor = Color.Red;
				itemLog.SubItems[3].Text = uri.AbsoluteUri;
				itemLog.SubItems[4].Text = "";
				itemLog.SubItems[5].Text = "";
			}
			catch(Exception)
			{
			}
			Monitor.Exit(this.listViewThreads);

			try
			{
				// create web request
				request = MyWebRequest.Create(uri, request, KeepAlive);
				// set request timeout
				request.Timeout = RequestTimeout*1000;
				// retrieve response from web request
				MyWebResponse response = request.GetResponse();
				// update status text with the request and response headers
				strStatus += request.Header+response.Header;
				
				// check for redirection
				if(response.ResponseUri.Equals(uri) == false)
				{
					// add the new uri to the queue
					this.EnqueueUri(new MyUri(response.ResponseUri.AbsoluteUri), true);
					// update status
					strStatus += "Redirected to: "+response.ResponseUri+"\r\n";
					// log current uri status
					LogUri(uri.AbsoluteUri, strStatus);
					// reset current request to avoid response socket opening case
					request = null;
					return;
				}

				// check for allowed MIME types
				if(AllMIMETypes == false && response.ContentType != null && MIMETypes.Length > 0)
				{
					string strContentType = response.ContentType.ToLower();
					int nExtIndex = strContentType.IndexOf(';');
					if(nExtIndex != -1)
						strContentType = strContentType.Substring(0, nExtIndex);
					if(strContentType.IndexOf('*') == -1 && (nExtIndex = MIMETypes.IndexOf(strContentType)) == -1)
					{
						LogError(uri.AbsoluteUri, strStatus+"\r\nUnlisted Content-Type ("+strContentType+"), check settings.");
						request = null;
						return;
					}
					// find numbers
					Match match = new Regex(@"\d+").Match(MIMETypes, nExtIndex);
					int nMin = int.Parse(match.Value)*1024;
					match = match.NextMatch();
					int nMax = int.Parse(match.Value)*1024;
					if(nMin < nMax && (response.ContentLength < nMin || response.ContentLength > nMax))
					{
						LogError(uri.AbsoluteUri, strStatus+"\r\nContentLength limit error ("+response.ContentLength+")");
						request = null;
						return;
					}
				}
				
				// check for response extention
				string[] ExtArray = { ".gif", ".jpg", ".css", ".zip", ".exe"	};
				bool bParse = true;
				foreach(string ext in ExtArray)
					if(uri.AbsoluteUri.ToLower().EndsWith(ext) == true)
					{
						bParse = false;
						break;
					}
				foreach(string ext in ExcludeFiles)
					if(ext.Trim().Length > 0 && uri.AbsoluteUri.ToLower().EndsWith(ext) == true)
					{
						bParse = false;
						break;
					}

				// construct path in the hard disk
				string strLocalPath = uri.LocalPath;
				// check if the path ends with / to can crate the file on the HD 
				if(strLocalPath.EndsWith("/") == true)
					// check if there is no query like (.asp?i=32&j=212)
					if(uri.Query == "")
						// add a default name for / ended pathes
						strLocalPath += "default.html";
				// check if the uri includes a query string
				if(uri.Query != "")
					// construct the name from the query hash value to be the same if we download it again
					strLocalPath += uri.Query.GetHashCode()+".html";
				// construct the full path folder
				string BasePath = this.Downloadfolder+"\\"+uri.Host+Path.GetDirectoryName(uri.AbsolutePath);
				// check if the folder not found
				if(Directory.Exists(BasePath) == false)
					// create the folder
					Directory.CreateDirectory(BasePath);
				// construct the full path name of the file
				string PathName = this.Downloadfolder+"\\"+uri.Host+strLocalPath.Replace("%20", " ");
				// open the output file
				FileStream streamOut = File.Open(PathName, FileMode.Create, FileAccess.Write, FileShare.ReadWrite);
				BinaryWriter writer = new BinaryWriter(streamOut);

				itemLog.SubItems[2].Text = "Download";
				itemLog.ForeColor = Color.Black;
				// receive response buffer
				string strResponse = "";
				byte[] RecvBuffer = new byte[10240];
				int nBytes, nTotalBytes = 0;
				// loop to receive response buffer
				while((nBytes = response.socket.Receive(RecvBuffer, 0, 10240, SocketFlags.None)) > 0)
				{
					// increment total received bytes
					nTotalBytes += nBytes;
					// write received buffer to file
					writer.Write(RecvBuffer, 0, nBytes);
					// check if the uri type not binary to can be parsed for refs
					if(bParse == true)
						// add received buffer to response string
						strResponse += Encoding.ASCII.GetString(RecvBuffer, 0, nBytes);
					// update view text
					itemLog.SubItems[4].Text = Commas(nTotalBytes);
					if(response.ContentLength > 0)
						itemLog.SubItems[5].Text = '%'+(100-(response.ContentLength-nTotalBytes)*100/response.ContentLength).ToString();
					// check if connection Keep-Alive to can break the loop if response completed
					if(response.KeepAlive && nTotalBytes >= response.ContentLength && response.ContentLength > 0)
						break;
				}
				// close output stream
				writer.Close();
				streamOut.Close();
				
				if(response.KeepAlive)
					strStatus += "Connection kept alive to be used in subpages.\r\n";
				else
				{
					// close response
					response.Close();
					strStatus += "Connection closed.\r\n";
				}
				// update status
				strStatus += Commas(nTotalBytes)+" bytes, downloaded to \""+PathName+"\"\r\n";
				// increment total file count
				FileCount++;
				// increment total bytes count
				ByteCount += nTotalBytes;

				if(ThreadsRunning == true && bParse == true && uri.Depth < WebDepth)
				{
					strStatus += "\r\nParsing page ...\r\n";

					// check for restricted words
					foreach(string strExcludeWord in ExcludeWords)
						if(strExcludeWord.Trim().Length > 0 && strResponse.IndexOf(strExcludeWord) != -1)
						{
							LogError(uri.AbsoluteUri, strStatus+"\r\nPage includes reserved word ("+strExcludeWord+")");
							EraseItem(itemLog);
							File.Delete(PathName);
							return;
						}			

					// parse the page to search for refs
					string strRef = @"(href|HREF|src|SRC)[ ]*=[ ]*[""'][^""'#>]+[""']";
					MatchCollection matches = new Regex(strRef).Matches(strResponse);
					strStatus += "Found: "+matches.Count+" ref(s)\r\n";
					URLCount += matches.Count;
					foreach(Match match in matches)
					{
						strRef = match.Value.Substring(match.Value.IndexOf('=')+1).Trim('"', '\'', '#', ' ', '>');
						try
						{
							if(strRef.IndexOf("..") != -1 || strRef.StartsWith("/"

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -