⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 frmmain.cs

📁 A web crawler (also known as a web spider or web robot) is a program or automated script which browses the World Wide Web in a methodical, automated manner.
💻 CS
字号:
using System;
using System.Drawing;
using System.Collections;
using System.ComponentModel;
using System.Windows.Forms;
using System.Data;
using System.IO;
using System.Xml;

namespace WebCrawlerDemo
{
	/// <summary>
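	/// Main window of the web crawler demo: collects the crawl settings, runs the
	/// crawl and lists the e-mail addresses that were found.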
	/// </summary>
	public class frmMain : System.Windows.Forms.Form
	{
		// Controls of the form. All of them are instantiated and configured in
		// InitializeComponent.

		// Caption and input box for the start URL.
		private System.Windows.Forms.Label label1;
		private System.Windows.Forms.TextBox txtUrl;
		// Starts the crawl (Click is wired to btnCrawl_Click).
		private System.Windows.Forms.Button btnCrawl;
		// Caption and input box for the crawler name; the name is also used as
		// the output directory by TestCrawler and NewPageEvent.
		private System.Windows.Forms.Label label2;
		private System.Windows.Forms.TextBox txtCrawlerName;
		// Input box and caption for the maximum number of crawling levels.
		private System.Windows.Forms.TextBox txtCrawlingLevel;
		private System.Windows.Forms.Label label3;
		// "Web page" group holding the four option check boxes below.
		private System.Windows.Forms.GroupBox groupBox1;
		private System.Windows.Forms.CheckBox chkExtractHTML;
		private System.Windows.Forms.CheckBox chkExtractBuffer;
        private System.Windows.Forms.CheckBox chkDontLeaveSite;
		// Multi-line box that receives the e-mail addresses found.
		private System.Windows.Forms.TextBox txtEmails;
		private System.Windows.Forms.CheckBox chkDownloadImages;
		// "Emails:" caption above txtEmails.
		private System.Windows.Forms.Label label4;
		/// <summary>
		/// Required designer variable. It is never assigned in this file, so it
		/// stays null; Dispose checks for that before using it.
		/// </summary>
		private System.ComponentModel.Container components = null;

		/// <summary>
		/// Creates the main form and builds its designer-defined controls.
		/// </summary>
		public frmMain()
		{
			// Designer support: all controls are created and laid out here.
			this.InitializeComponent();
		}

		/// <summary>
		/// Releases the resources held by the form.
		/// </summary>
		/// <param name="disposing">
		/// When true, the designer component container (if any) is disposed
		/// before the base class clean-up runs.
		/// </param>
		protected override void Dispose( bool disposing )
		{
			if( disposing && components != null )
				components.Dispose();

			base.Dispose( disposing );
		}

		#region Windows Form Designer generated code
		/// <summary>
		/// Required method for Designer support - do not modify
		/// the contents of this method with the code editor.
		/// </summary>
		/// <remarks>
		/// Creates every control of the form and assigns its location, size,
		/// tab index and initial text, places the four option check boxes inside
		/// groupBox1, adds the remaining controls to the form, and subscribes
		/// btnCrawl_Click to the Crawl button's Click event. Layout of the group
		/// box and of the form is suspended while the controls are configured and
		/// resumed at the end of the method.
		/// </remarks>
		private void InitializeComponent()
		{
			this.label1 = new System.Windows.Forms.Label();
			this.txtUrl = new System.Windows.Forms.TextBox();
			this.btnCrawl = new System.Windows.Forms.Button();
			this.txtCrawlerName = new System.Windows.Forms.TextBox();
			this.label2 = new System.Windows.Forms.Label();
			this.txtCrawlingLevel = new System.Windows.Forms.TextBox();
			this.label3 = new System.Windows.Forms.Label();
			this.groupBox1 = new System.Windows.Forms.GroupBox();
			this.chkDontLeaveSite = new System.Windows.Forms.CheckBox();
			this.chkExtractBuffer = new System.Windows.Forms.CheckBox();
			this.chkExtractHTML = new System.Windows.Forms.CheckBox();
			this.txtEmails = new System.Windows.Forms.TextBox();
			this.chkDownloadImages = new System.Windows.Forms.CheckBox();
			this.label4 = new System.Windows.Forms.Label();
			this.groupBox1.SuspendLayout();
			this.SuspendLayout();
			// 
			// label1
			// 
			this.label1.Location = new System.Drawing.Point(8, 24);
			this.label1.Name = "label1";
			this.label1.Size = new System.Drawing.Size(32, 20);
			this.label1.TabIndex = 0;
			this.label1.Text = "Url";
			// 
			// txtUrl
			// 
			this.txtUrl.Location = new System.Drawing.Point(92, 24);
			this.txtUrl.Name = "txtUrl";
			this.txtUrl.Size = new System.Drawing.Size(496, 20);
			this.txtUrl.TabIndex = 1;
			this.txtUrl.Text = "http://www.noviway.com";
			// 
			// btnCrawl
			// 
			this.btnCrawl.Font = new System.Drawing.Font("Microsoft Sans Serif", 8.25F, System.Drawing.FontStyle.Bold, System.Drawing.GraphicsUnit.Point, ((System.Byte)(177)));
			this.btnCrawl.Location = new System.Drawing.Point(488, 376);
			this.btnCrawl.Name = "btnCrawl";
			this.btnCrawl.Size = new System.Drawing.Size(96, 36);
			this.btnCrawl.TabIndex = 7;
			this.btnCrawl.Text = "Crawl";
			this.btnCrawl.Click += new System.EventHandler(this.btnCrawl_Click);
			// 
			// txtCrawlerName
			// 
			this.txtCrawlerName.Location = new System.Drawing.Point(92, 48);
			this.txtCrawlerName.Name = "txtCrawlerName";
			this.txtCrawlerName.Size = new System.Drawing.Size(496, 20);
			this.txtCrawlerName.TabIndex = 2;
			this.txtCrawlerName.Text = "Noviway_WebCrawler";
			// 
			// label2
			// 
			this.label2.Location = new System.Drawing.Point(8, 48);
			this.label2.Name = "label2";
			this.label2.Size = new System.Drawing.Size(84, 32);
			this.label2.TabIndex = 3;
			this.label2.Text = "Crawler Name";
			// 
			// txtCrawlingLevel
			// 
			this.txtCrawlingLevel.Location = new System.Drawing.Point(92, 84);
			this.txtCrawlingLevel.Name = "txtCrawlingLevel";
			this.txtCrawlingLevel.Size = new System.Drawing.Size(40, 20);
			this.txtCrawlingLevel.TabIndex = 3;
			this.txtCrawlingLevel.Text = "3";
			// 
			// label3
			// 
			this.label3.Location = new System.Drawing.Point(8, 84);
			this.label3.Name = "label3";
			this.label3.Size = new System.Drawing.Size(84, 32);
			this.label3.TabIndex = 5;
			this.label3.Text = "Crawler Levels";
			// 
			// groupBox1
			// 
			this.groupBox1.Controls.Add(this.chkDownloadImages);
			this.groupBox1.Controls.Add(this.chkDontLeaveSite);
			this.groupBox1.Controls.Add(this.chkExtractBuffer);
			this.groupBox1.Controls.Add(this.chkExtractHTML);
			this.groupBox1.Location = new System.Drawing.Point(8, 124);
			this.groupBox1.Name = "groupBox1";
			this.groupBox1.Size = new System.Drawing.Size(580, 104);
			this.groupBox1.TabIndex = 8;
			this.groupBox1.TabStop = false;
			this.groupBox1.Text = "Web page";
			// 
			// chkDontLeaveSite
			// 
			this.chkDontLeaveSite.Checked = true;
			this.chkDontLeaveSite.CheckState = System.Windows.Forms.CheckState.Checked;
			this.chkDontLeaveSite.Location = new System.Drawing.Point(16, 64);
			this.chkDontLeaveSite.Name = "chkDontLeaveSite";
			this.chkDontLeaveSite.Size = new System.Drawing.Size(108, 24);
			this.chkDontLeaveSite.TabIndex = 6;
			this.chkDontLeaveSite.Text = "Don\'t Leave Site";
			// 
			// chkExtractBuffer
			// 
			this.chkExtractBuffer.Checked = true;
			this.chkExtractBuffer.CheckState = System.Windows.Forms.CheckState.Checked;
			this.chkExtractBuffer.Location = new System.Drawing.Point(144, 28);
			this.chkExtractBuffer.Name = "chkExtractBuffer";
			this.chkExtractBuffer.Size = new System.Drawing.Size(108, 24);
			this.chkExtractBuffer.TabIndex = 5;
			this.chkExtractBuffer.Text = "Extract Buffer";
			// 
			// chkExtractHTML
			// 
			this.chkExtractHTML.Checked = true;
			this.chkExtractHTML.CheckState = System.Windows.Forms.CheckState.Checked;
			this.chkExtractHTML.Location = new System.Drawing.Point(16, 28);
			this.chkExtractHTML.Name = "chkExtractHTML";
			this.chkExtractHTML.Size = new System.Drawing.Size(108, 24);
			this.chkExtractHTML.TabIndex = 4;
			this.chkExtractHTML.Text = "Extract HTML";
			// 
			// txtEmails
			// 
			this.txtEmails.Location = new System.Drawing.Point(8, 252);
			this.txtEmails.Multiline = true;
			this.txtEmails.Name = "txtEmails";
			this.txtEmails.Size = new System.Drawing.Size(576, 108);
			this.txtEmails.TabIndex = 9;
			this.txtEmails.Text = "";
			// 
			// chkDownloadImages
			// 
			this.chkDownloadImages.Location = new System.Drawing.Point(144, 68);
			this.chkDownloadImages.Name = "chkDownloadImages";
			this.chkDownloadImages.Size = new System.Drawing.Size(172, 16);
			this.chkDownloadImages.TabIndex = 7;
			this.chkDownloadImages.Text = "Download images";
			// 
			// label4
			// 
			this.label4.Location = new System.Drawing.Point(12, 232);
			this.label4.Name = "label4";
			this.label4.Size = new System.Drawing.Size(172, 16);
			this.label4.TabIndex = 10;
			this.label4.Text = "Emails:";
			// 
			// frmMain
			// 
			this.AutoScaleBaseSize = new System.Drawing.Size(5, 13);
			this.ClientSize = new System.Drawing.Size(600, 421);
			this.Controls.Add(this.label4);
			this.Controls.Add(this.txtEmails);
			this.Controls.Add(this.groupBox1);
			this.Controls.Add(this.txtCrawlingLevel);
			this.Controls.Add(this.label3);
			this.Controls.Add(this.txtCrawlerName);
			this.Controls.Add(this.label2);
			this.Controls.Add(this.btnCrawl);
			this.Controls.Add(this.txtUrl);
			this.Controls.Add(this.label1);
			this.Name = "frmMain";
			this.Text = "Noviway WebCrawler";
			this.groupBox1.ResumeLayout(false);
			this.ResumeLayout(false);

		}
		#endregion

		/// <summary>
		/// Application entry point: creates the crawler form and hands it to
		/// Application.Run.
		/// </summary>
		[STAThread]
		static void Main() 
		{
			frmMain mainForm = new frmMain();

			Application.Run(mainForm);
		}

		// Running number used by NewPageEvent to name the saved page files
		// ("<n>.html"); incremented before each file is created.
		private int m_Counter = 0;
		// The three lists below are used as sorted sets: the key is the item and
		// the value is always 'true'.
		// Keys: e-mail addresses extracted from the crawled pages.
        private SortedList m_EmailList = new SortedList();
		// Keys: absolute image URLs collected from <img src="..."> elements.
		private SortedList m_ImageList = new SortedList();
		// Keys: URLs of the HTML pages seen; written to sitemap.txt by TestCrawler.
		private SortedList m_UrlList = new SortedList();

		/// <summary>
		/// Handler registered with the crawler as its NewPageEvent callback.
		/// For a page whose content type contains "text/html" it records the
		/// page URL, collects the e-mail addresses found in the HTML, optionally
		/// records the absolute URLs of the page's images, and writes the raw
		/// page buffer to "&lt;crawler name&gt;\&lt;n&gt;.html".
		/// </summary>
		/// <remarks>
		/// Processing is best effort: failures are written to the trace output
		/// and otherwise ignored, so no exception travels back into the crawler.
		/// NOTE(review): txtEmails is updated directly, which assumes the
		/// crawler raises this callback on the UI thread - confirm.
		/// </remarks>
		/// <param name="page">Page handed over by the crawler.</param>
		/// <param name="level">Level supplied by the crawler; not used by this handler.</param>
		private void NewPageEvent(Noviway.WebCrawler.Crawler.WebPage page, int level)
		{
			try
			{
				// We only want HTML files ( not jpeg, zip .... )
				string contentType = page.ContentType;
				if ( contentType == null || contentType.ToLower().IndexOf("text/html") == -1 )
					return;

				// Save the url
				m_UrlList[ page.Url ] = true;

				CollectEmails( page.HTML );

				if ( chkDownloadImages.Checked )
					CollectImageUrls( page );

				// The page's plain text, should it be needed, can be read from
				// page.XML.InnerText (when page.XML is not null).

				SavePageBuffer( page.Buffer );
			}
			catch ( Exception ex )
			{
				// One bad page must not end the crawl, but leave a trace of it.
				System.Diagnostics.Trace.WriteLine( string.Format( "NewPageEvent: page skipped: {0}", ex ) );
			}
		}

		/// <summary>
		/// Adds every e-mail address found in <paramref name="html"/> to
		/// m_EmailList and appends the addresses to the e-mail text box.
		/// </summary>
		private void CollectEmails( string html )
		{
			// No HTML available for this page: nothing to scan.
			if ( html == null )
				return;

			ArrayList emails = new ArrayList();

			Noviway.WebCrawler.Crawler.ExtractEmailAddresses( html, ref emails );

			if ( emails == null )
				return;

			// Build the new text once instead of re-assigning the whole text box
			// content for every single address.
			System.Text.StringBuilder pending = new System.Text.StringBuilder();

			foreach ( string email in emails )
			{
				m_EmailList[ email ] = true;

				pending.Append( Environment.NewLine );
				pending.Append( email );
			}

			if ( pending.Length > 0 )
				txtEmails.AppendText( pending.ToString() );
		}

		/// <summary>
		/// Records the absolute URL of every &lt;img&gt; element of the page that
		/// carries a src attribute. Failures are traced and do not prevent the
		/// caller from saving the page.
		/// </summary>
		private void CollectImageUrls( Noviway.WebCrawler.Crawler.WebPage page )
		{
			try
			{
				if ( page.XML == null )
					return;

				XmlNodeList images = page.XML.GetElementsByTagName("img");

				foreach ( XmlNode node in images )
				{
					// An <img> without src has nothing to download; skip it
					// instead of letting it abort the rest of the page's images.
					XmlAttribute srcAttribute = ( node.Attributes == null ) ? null : node.Attributes["src"];
					if ( srcAttribute == null )
						continue;

					// Get absolute url
					string absoluteUrl = Noviway.WebCrawler.Crawler.GetAbsoluteUrl( page.Url, srcAttribute.Value );

					if ( absoluteUrl != null && absoluteUrl.Length > 0 )
						m_ImageList[ absoluteUrl ] = true;
				}
			}
			catch ( Exception ex )
			{
				System.Diagnostics.Trace.WriteLine( string.Format( "NewPageEvent: image links skipped: {0}", ex ) );
			}
		}

		/// <summary>
		/// Writes <paramref name="buffer"/> to the next numbered ".html" file in
		/// the directory named after the crawler. Does nothing for a null buffer.
		/// </summary>
		private void SavePageBuffer( byte[] buffer )
		{
			if ( buffer == null )
				return;

			string fileName = string.Format( "{0}.html", ++m_Counter );
			string path = Path.Combine( txtCrawlerName.Text, fileName );

			// 'using' closes the file even when the write fails.
			using ( FileStream fs = File.Create( path ) )
			{
				fs.Write( buffer, 0, buffer.Length );
			}
		}

		/// <summary>
		/// Runs one crawl with the settings currently entered on the form.
		/// </summary>
		/// <remarks>
		/// Steps: validate the input, reset the results of any previous run,
		/// create the output directory (named after the crawler), crawl the
		/// start URL with NewPageEvent as the page callback, write the visited
		/// URLs to "sitemap.txt", show the unique e-mail addresses, and - when
		/// "Download images" is checked - download the collected images into
		/// the output directory.
		/// Invalid input is reported with a message box and the method returns
		/// without crawling. I/O errors while creating the directory or writing
		/// the site map are not caught here.
		/// </remarks>
		public void TestCrawler(  )
		{
			string outputDirectory = txtCrawlerName.Text;

			if ( outputDirectory.Trim().Length == 0 )
			{
				MessageBox.Show( this, "Please enter a crawler name.", this.Text, MessageBoxButtons.OK, MessageBoxIcon.Warning );
				return;
			}

			int maxLevels = 0;

			try
			{
				maxLevels = Convert.ToInt32( txtCrawlingLevel.Text );
			}
			catch ( FormatException )
			{
				MessageBox.Show( this, "Crawler Levels must be a whole number.", this.Text, MessageBoxButtons.OK, MessageBoxIcon.Warning );
				return;
			}
			catch ( OverflowException )
			{
				MessageBox.Show( this, "Crawler Levels is out of range.", this.Text, MessageBoxButtons.OK, MessageBoxIcon.Warning );
				return;
			}

			// Start from a clean slate: without this a second crawl would write
			// the previous run's URLs into the new sitemap and download the
			// previous run's images again.
			m_Counter = 0;
			m_UrlList.Clear();
			m_EmailList.Clear();
			m_ImageList.Clear();

			Directory.CreateDirectory( outputDirectory );

			Noviway.WebCrawler.Crawler crawler = new Noviway.WebCrawler.Crawler( Noviway.IO.Logger.Empty );

			crawler.MaxCrawlingLevels = maxLevels;
			crawler.SaveBuffer = chkExtractBuffer.Checked;
			crawler.SaveHTML = chkExtractHTML.Checked;
			crawler.DontLeaveSite = chkDontLeaveSite.Checked;
			crawler.NewPageEvent = new Noviway.WebCrawler.Crawler.NewPageCallback(NewPageEvent);

			// Crawler
			crawler.Crawl(txtUrl.Text);

			WriteSiteMap( outputDirectory );
			ShowUniqueEmails();

			if ( chkDownloadImages.Checked )
				DownloadImages( outputDirectory );
		}

		/// <summary>
		/// Writes the collected page URLs, one per line, to "sitemap.txt" in
		/// <paramref name="directory"/>.
		/// </summary>
		private void WriteSiteMap( string directory )
		{
			// 'using' flushes and closes the file even when a write fails.
			using ( StreamWriter writer = new StreamWriter( Path.Combine( directory, "sitemap.txt" ) ) )
			{
				foreach ( string url in m_UrlList.Keys )
					writer.WriteLine( url );
			}
		}

		/// <summary>
		/// Replaces the content of the e-mail text box with the list of unique
		/// addresses collected during the crawl.
		/// </summary>
		private void ShowUniqueEmails()
		{
			// Assemble the text first so the text box is assigned only once.
			System.Text.StringBuilder text = new System.Text.StringBuilder( "Unique emails:" );
			text.Append( Environment.NewLine );

			foreach ( string email in m_EmailList.Keys )
			{
				text.Append( Environment.NewLine );
				text.Append( email );
			}

			txtEmails.Text = text.ToString();
		}

		/// <summary>
		/// Downloads every collected image URL into <paramref name="directory"/>,
		/// using the last segment of the URL as the file name. A failing image is
		/// traced and skipped so the remaining images are still downloaded.
		/// </summary>
		private void DownloadImages( string directory )
		{
			foreach ( string source in m_ImageList.Keys )
			{
				System.Net.HttpWebResponse response = null;
				MemoryStream memStream = new MemoryStream();

				try
				{
					Noviway.HTTPBrowser.Browser browser = new Noviway.HTTPBrowser.Browser();

					if ( browser.Navigate( new Noviway.HTTPBrowser.Browser.Stage("GET", source, string.Empty ), ref memStream, out response ) )
					{
						string path = Path.Combine( directory, Path.GetFileName( source ) );

						using ( FileStream fs = File.Create( path ) )
						{
							memStream.WriteTo( fs );
						}
					}
				}
				catch ( Exception ex )
				{
					System.Diagnostics.Trace.WriteLine( string.Format( "TestCrawler: image '{0}' skipped: {1}", source, ex ) );
				}
				finally
				{
					// Release the HTTP connection and the download buffer whether
					// or not the image could be saved.
					if ( response != null )
						response.Close();

					if ( memStream != null )
						memStream.Close();
				}
			}
		}

		/// <summary>
		/// Click handler of the Crawl button: runs TestCrawler while showing a
		/// wait cursor and reports any failure in a message box instead of
		/// letting the exception escape the event handler.
		/// </summary>
		/// <param name="sender">Control that raised the event; not used.</param>
		/// <param name="e">Event data; not used.</param>
		private void btnCrawl_Click(object sender, System.EventArgs e)
		{
			// NOTE(review): TestCrawler appears to run the whole crawl before it
			// returns, so give the user some busy feedback meanwhile.
			System.Windows.Forms.Cursor.Current = Cursors.WaitCursor;

			try
			{
				TestCrawler();
			}
			catch ( Exception ex )
			{
				MessageBox.Show( this, ex.Message, this.Text, MessageBoxButtons.OK, MessageBoxIcon.Error );
			}
			finally
			{
				System.Windows.Forms.Cursor.Current = Cursors.Default;
			}
		}

		
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -