📄 frmmain.cs
字号:
using System;
using System.Drawing;
using System.Collections;
using System.ComponentModel;
using System.Windows.Forms;
using System.Data;
using System.IO;
using System.Xml;
namespace WebCrawlerDemo
{
/// <summary>
/// Summary description for Form1.
/// </summary>
public class frmMain : System.Windows.Forms.Form
{
// Controls below are instantiated and configured in InitializeComponent().
private System.Windows.Forms.Label label1;
// Start URL; its text is passed to crawler.Crawl() in TestCrawler().
private System.Windows.Forms.TextBox txtUrl;
// Starts the crawl (wired to btnCrawl_Click).
private System.Windows.Forms.Button btnCrawl;
private System.Windows.Forms.Label label2;
// Its text is used as the output directory for saved pages, sitemap.txt and images.
private System.Windows.Forms.TextBox txtCrawlerName;
// Its text is converted to an integer and assigned to crawler.MaxCrawlingLevels.
private System.Windows.Forms.TextBox txtCrawlingLevel;
private System.Windows.Forms.Label label3;
private System.Windows.Forms.GroupBox groupBox1;
// Mapped to crawler.SaveHTML.
private System.Windows.Forms.CheckBox chkExtractHTML;
// Mapped to crawler.SaveBuffer.
private System.Windows.Forms.CheckBox chkExtractBuffer;
// Mapped to crawler.DontLeaveSite.
private System.Windows.Forms.CheckBox chkDontLeaveSite;
// Multiline box that displays the e-mail addresses found while crawling.
private System.Windows.Forms.TextBox txtEmails;
// When checked, image URLs are collected per page and downloaded after the crawl.
private System.Windows.Forms.CheckBox chkDownloadImages;
private System.Windows.Forms.Label label4;
/// <summary>
/// Required designer variable.
/// </summary>
// Stays null in this file; Dispose(bool) only disposes it when it is non-null.
private System.ComponentModel.Container components = null;
/// <summary>
/// Creates the main form and builds its controls.
/// </summary>
public frmMain()
{
    // All control creation and layout lives in the designer-generated method.
    this.InitializeComponent();
}
/// <summary>
/// Releases the designer component container, when one exists, and then the base form.
/// </summary>
/// <param name="disposing">When true, the components container is disposed as well.</param>
protected override void Dispose( bool disposing )
{
    bool releaseComponents = disposing && components != null;
    if ( releaseComponents )
    {
        components.Dispose();
    }

    // The base class is always given the chance to clean up, whatever the flag says.
    base.Dispose( disposing );
}
#region Windows Form Designer generated code
/// <summary>
/// Required method for Designer support - do not modify
/// the contents of this method with the code editor.
/// </summary>
private void InitializeComponent()
{
// Instantiate every control first; properties are assigned in the sections below.
this.label1 = new System.Windows.Forms.Label();
this.txtUrl = new System.Windows.Forms.TextBox();
this.btnCrawl = new System.Windows.Forms.Button();
this.txtCrawlerName = new System.Windows.Forms.TextBox();
this.label2 = new System.Windows.Forms.Label();
this.txtCrawlingLevel = new System.Windows.Forms.TextBox();
this.label3 = new System.Windows.Forms.Label();
this.groupBox1 = new System.Windows.Forms.GroupBox();
this.chkDontLeaveSite = new System.Windows.Forms.CheckBox();
this.chkExtractBuffer = new System.Windows.Forms.CheckBox();
this.chkExtractHTML = new System.Windows.Forms.CheckBox();
this.txtEmails = new System.Windows.Forms.TextBox();
this.chkDownloadImages = new System.Windows.Forms.CheckBox();
this.label4 = new System.Windows.Forms.Label();
// Layout is suspended on the group box and the form while they are configured;
// the matching ResumeLayout calls are at the end of this method.
this.groupBox1.SuspendLayout();
this.SuspendLayout();
//
// label1
//
this.label1.Location = new System.Drawing.Point(8, 24);
this.label1.Name = "label1";
this.label1.Size = new System.Drawing.Size(32, 20);
this.label1.TabIndex = 0;
this.label1.Text = "Url";
//
// txtUrl
//
this.txtUrl.Location = new System.Drawing.Point(92, 24);
this.txtUrl.Name = "txtUrl";
this.txtUrl.Size = new System.Drawing.Size(496, 20);
this.txtUrl.TabIndex = 1;
// Pre-filled start URL.
this.txtUrl.Text = "http://www.noviway.com";
//
// btnCrawl
//
this.btnCrawl.Font = new System.Drawing.Font("Microsoft Sans Serif", 8.25F, System.Drawing.FontStyle.Bold, System.Drawing.GraphicsUnit.Point, ((System.Byte)(177)));
this.btnCrawl.Location = new System.Drawing.Point(488, 376);
this.btnCrawl.Name = "btnCrawl";
this.btnCrawl.Size = new System.Drawing.Size(96, 36);
this.btnCrawl.TabIndex = 7;
this.btnCrawl.Text = "Crawl";
// Clicking the button runs btnCrawl_Click.
this.btnCrawl.Click += new System.EventHandler(this.btnCrawl_Click);
//
// txtCrawlerName
//
this.txtCrawlerName.Location = new System.Drawing.Point(92, 48);
this.txtCrawlerName.Name = "txtCrawlerName";
this.txtCrawlerName.Size = new System.Drawing.Size(496, 20);
this.txtCrawlerName.TabIndex = 2;
// Pre-filled crawler name; TestCrawler() uses this text as the output directory.
this.txtCrawlerName.Text = "Noviway_WebCrawler";
//
// label2
//
this.label2.Location = new System.Drawing.Point(8, 48);
this.label2.Name = "label2";
this.label2.Size = new System.Drawing.Size(84, 32);
// NOTE(review): txtCrawlingLevel below is also assigned TabIndex 3 - confirm the intended tab order.
this.label2.TabIndex = 3;
this.label2.Text = "Crawler Name";
//
// txtCrawlingLevel
//
this.txtCrawlingLevel.Location = new System.Drawing.Point(92, 84);
this.txtCrawlingLevel.Name = "txtCrawlingLevel";
this.txtCrawlingLevel.Size = new System.Drawing.Size(40, 20);
this.txtCrawlingLevel.TabIndex = 3;
// Pre-filled value; TestCrawler() converts this text to crawler.MaxCrawlingLevels.
this.txtCrawlingLevel.Text = "3";
//
// label3
//
this.label3.Location = new System.Drawing.Point(8, 84);
this.label3.Name = "label3";
this.label3.Size = new System.Drawing.Size(84, 32);
this.label3.TabIndex = 5;
this.label3.Text = "Crawler Levels";
//
// groupBox1
//
// The four option check boxes are children of the group box, not of the form.
this.groupBox1.Controls.Add(this.chkDownloadImages);
this.groupBox1.Controls.Add(this.chkDontLeaveSite);
this.groupBox1.Controls.Add(this.chkExtractBuffer);
this.groupBox1.Controls.Add(this.chkExtractHTML);
this.groupBox1.Location = new System.Drawing.Point(8, 124);
this.groupBox1.Name = "groupBox1";
this.groupBox1.Size = new System.Drawing.Size(580, 104);
this.groupBox1.TabIndex = 8;
this.groupBox1.TabStop = false;
this.groupBox1.Text = "Web page";
//
// chkDontLeaveSite
//
this.chkDontLeaveSite.Checked = true;
this.chkDontLeaveSite.CheckState = System.Windows.Forms.CheckState.Checked;
this.chkDontLeaveSite.Location = new System.Drawing.Point(16, 64);
this.chkDontLeaveSite.Name = "chkDontLeaveSite";
this.chkDontLeaveSite.Size = new System.Drawing.Size(108, 24);
this.chkDontLeaveSite.TabIndex = 6;
this.chkDontLeaveSite.Text = "Don\'t Leave Site";
//
// chkExtractBuffer
//
this.chkExtractBuffer.Checked = true;
this.chkExtractBuffer.CheckState = System.Windows.Forms.CheckState.Checked;
this.chkExtractBuffer.Location = new System.Drawing.Point(144, 28);
this.chkExtractBuffer.Name = "chkExtractBuffer";
this.chkExtractBuffer.Size = new System.Drawing.Size(108, 24);
this.chkExtractBuffer.TabIndex = 5;
this.chkExtractBuffer.Text = "Extract Buffer";
//
// chkExtractHTML
//
this.chkExtractHTML.Checked = true;
this.chkExtractHTML.CheckState = System.Windows.Forms.CheckState.Checked;
this.chkExtractHTML.Location = new System.Drawing.Point(16, 28);
this.chkExtractHTML.Name = "chkExtractHTML";
this.chkExtractHTML.Size = new System.Drawing.Size(108, 24);
this.chkExtractHTML.TabIndex = 4;
this.chkExtractHTML.Text = "Extract HTML";
//
// txtEmails
//
this.txtEmails.Location = new System.Drawing.Point(8, 252);
// Multiline so each e-mail address can be shown on its own line.
this.txtEmails.Multiline = true;
this.txtEmails.Name = "txtEmails";
this.txtEmails.Size = new System.Drawing.Size(576, 108);
this.txtEmails.TabIndex = 9;
this.txtEmails.Text = "";
//
// chkDownloadImages
//
// Unlike the other three check boxes, no Checked/CheckState assignment is made here.
this.chkDownloadImages.Location = new System.Drawing.Point(144, 68);
this.chkDownloadImages.Name = "chkDownloadImages";
this.chkDownloadImages.Size = new System.Drawing.Size(172, 16);
this.chkDownloadImages.TabIndex = 7;
this.chkDownloadImages.Text = "Download images";
//
// label4
//
this.label4.Location = new System.Drawing.Point(12, 232);
this.label4.Name = "label4";
this.label4.Size = new System.Drawing.Size(172, 16);
this.label4.TabIndex = 10;
this.label4.Text = "Emails:";
//
// frmMain
//
this.AutoScaleBaseSize = new System.Drawing.Size(5, 13);
this.ClientSize = new System.Drawing.Size(600, 421);
// Top-level controls of the form; the check boxes are reached through groupBox1.
this.Controls.Add(this.label4);
this.Controls.Add(this.txtEmails);
this.Controls.Add(this.groupBox1);
this.Controls.Add(this.txtCrawlingLevel);
this.Controls.Add(this.label3);
this.Controls.Add(this.txtCrawlerName);
this.Controls.Add(this.label2);
this.Controls.Add(this.btnCrawl);
this.Controls.Add(this.txtUrl);
this.Controls.Add(this.label1);
this.Name = "frmMain";
this.Text = "Noviway WebCrawler";
// Matches the SuspendLayout calls made before the controls were configured.
this.groupBox1.ResumeLayout(false);
this.ResumeLayout(false);
}
#endregion
/// <summary>
/// Application entry point: creates the crawler form and hands it to Application.Run.
/// </summary>
[STAThread]
static void Main()
{
    frmMain mainForm = new frmMain();
    Application.Run(mainForm);
}
// Incremented for every page buffer written to disk; the new value is the saved file's name.
private int m_Counter = 0;
// E-mail addresses found so far. Stored as keys (the value is always true) so duplicates collapse.
private SortedList m_EmailList = new SortedList();
// Absolute image URLs collected from img elements, stored as keys for the same reason.
private SortedList m_ImageList = new SortedList();
// URLs of the HTML pages seen by NewPageEvent; written to sitemap.txt after the crawl.
private SortedList m_UrlList = new SortedList();
/// <summary>
/// Callback registered on the crawler as NewPageEvent. For an HTML page it records
/// the URL, collects e-mail addresses and (optionally) image URLs, and saves the raw
/// page buffer to "&lt;crawler name&gt;\&lt;counter&gt;.html".
/// </summary>
/// <param name="page">The page handed over by the crawler.</param>
/// <param name="level">Not used by this handler.</param>
/// <remarks>
/// Best effort by design: any failure is written to the trace listeners and swallowed,
/// so an exception never propagates back into the crawler.
/// </remarks>
private void NewPageEvent(Noviway.WebCrawler.Crawler.WebPage page, int level)
{
    try
    {
        if ( page == null )
            return;

        // We only want HTML files ( not jpeg, zip .... ).
        // Invariant lower-casing: the content type is a protocol token, not UI text.
        string contentType = page.ContentType;
        if ( contentType == null ||
             contentType.ToLower( System.Globalization.CultureInfo.InvariantCulture ).IndexOf("text/html") == -1 )
            return;

        // Save the url
        m_UrlList[ page.Url ] = true;

        string content = page.HTML;
        ArrayList emails = new ArrayList();
        Noviway.WebCrawler.Crawler.ExtractEmailAddresses(content, ref emails);
        foreach (string email in emails)
        {
            m_EmailList[email] = true;
            // AppendText avoids re-copying the whole text box content for every address.
            txtEmails.AppendText(Environment.NewLine + email);
        }

        // Collect image URLs; they are downloaded after the crawl has finished.
        if ( chkDownloadImages.Checked && page.XML != null )
        {
            try
            {
                XmlNodeList images = page.XML.GetElementsByTagName("img");
                foreach ( XmlNode node in images )
                {
                    // An img element without a src attribute is skipped; previously it
                    // raised a NullReferenceException that aborted the rest of the loop.
                    XmlAttribute srcAttribute = node.Attributes == null ? null : node.Attributes["src"];
                    if ( srcAttribute == null )
                        continue;

                    // Get absolute url
                    string src = Noviway.WebCrawler.Crawler.GetAbsoluteUrl( page.Url, srcAttribute.Value );
                    m_ImageList[ src ] = true;
                }
            }
            catch ( Exception ex )
            {
                // Image collection is optional; keep going with the rest of the page.
                System.Diagnostics.Trace.WriteLine("NewPageEvent: image extraction failed: " + ex.Message);
            }
        }

        // The plain text of the page, if needed, is page.XML.InnerText (when page.XML is not null).

        if ( page.Buffer != null )
        {
            string fileName = Path.Combine(txtCrawlerName.Text, string.Format("{0}.html", ++m_Counter));
            // using guarantees the file handle is released even if the write fails.
            using ( FileStream fs = File.Create(fileName) )
            {
                fs.Write(page.Buffer, 0, page.Buffer.Length);
            }
        }
    }
    catch ( Exception ex )
    {
        System.Diagnostics.Trace.WriteLine("NewPageEvent failed: " + ex.Message);
    }
}
/// <summary>
/// Runs a crawl with the settings entered on the form, then writes sitemap.txt into
/// the crawler-name directory, lists the unique e-mail addresses in the text box and,
/// when "Download images" is checked, downloads the collected images.
/// </summary>
/// <remarks>
/// A crawl depth that is not a valid integer is reported with a message box and the
/// crawl is not started. Other failures (directory creation, sitemap writing) still
/// surface as exceptions, as before.
/// </remarks>
public void TestCrawler( )
{
    // Validate the depth before touching the disk or the network.
    int maxLevels;
    if ( !TryReadCrawlingLevels( out maxLevels ) )
    {
        MessageBox.Show( this, "Crawler Levels must be a whole number.", this.Text,
            MessageBoxButtons.OK, MessageBoxIcon.Warning );
        return;
    }

    // Start from a clean slate so a second crawl does not mix in the previous run's
    // results. m_Counter is left running so files saved by an earlier crawl into the
    // same directory are not overwritten.
    m_UrlList.Clear();
    m_EmailList.Clear();
    m_ImageList.Clear();

    Noviway.WebCrawler.Crawler crawler = new Noviway.WebCrawler.Crawler( Noviway.IO.Logger.Empty );
    string outputDirectory = txtCrawlerName.Text;
    Directory.CreateDirectory( outputDirectory );
    crawler.MaxCrawlingLevels = maxLevels;
    crawler.SaveBuffer = chkExtractBuffer.Checked;
    crawler.SaveHTML = chkExtractHTML.Checked;
    crawler.DontLeaveSite = chkDontLeaveSite.Checked;
    crawler.NewPageEvent = new Noviway.WebCrawler.Crawler.NewPageCallback(NewPageEvent);

    // Crawl; NewPageEvent fills the url / email / image lists.
    crawler.Crawl( txtUrl.Text );

    // using closes the writer even when a write fails.
    using ( StreamWriter sw = new StreamWriter( Path.Combine( outputDirectory, "sitemap.txt" ) ) )
    {
        foreach ( string url in m_UrlList.Keys )
        {
            sw.WriteLine( url );
        }
    }

    // Now write only the unique emails. The text is built once and assigned once
    // instead of re-copying the text box content for every address.
    System.Text.StringBuilder emailText = new System.Text.StringBuilder( "Unique emails:" + Environment.NewLine );
    foreach ( string email in m_EmailList.Keys )
    {
        emailText.Append( Environment.NewLine ).Append( email );
    }
    txtEmails.Text = emailText.ToString();

    if ( chkDownloadImages.Checked )
    {
        foreach ( string source in m_ImageList.Keys )
        {
            DownloadImage( source, outputDirectory );
        }
    }
}

// Parses the crawl depth typed into txtCrawlingLevel; returns false when it is not a valid Int32.
private bool TryReadCrawlingLevels( out int levels )
{
    levels = 0;
    try
    {
        levels = Convert.ToInt32( txtCrawlingLevel.Text );
        return true;
    }
    catch ( FormatException )
    {
        return false;
    }
    catch ( OverflowException )
    {
        return false;
    }
}

// Downloads one image URL into outputDirectory. Best effort: a failure is traced and skipped.
private void DownloadImage( string source, string outputDirectory )
{
    System.Net.HttpWebResponse response = null;
    MemoryStream memStream = new MemoryStream();
    try
    {
        Noviway.HTTPBrowser.Browser browser = new Noviway.HTTPBrowser.Browser();
        if ( browser.Navigate( new Noviway.HTTPBrowser.Browser.Stage("GET", source, string.Empty ), ref memStream, out response ) )
        {
            // Drop any query string / fragment: '?' is not valid in a file name, so such
            // images previously failed in File.Create and were silently skipped.
            int cut = source.IndexOfAny( new char[] { '?', '#' } );
            string withoutQuery = cut >= 0 ? source.Substring( 0, cut ) : source;
            string fileName = Path.GetFileName( withoutQuery );

            // A URL ending in '/' has no file name to save under.
            if ( fileName != null && fileName.Length > 0 )
            {
                using ( FileStream fs = File.Create( Path.Combine( outputDirectory, fileName ) ) )
                {
                    memStream.WriteTo( fs );
                }
            }
        }
    }
    catch ( Exception ex )
    {
        System.Diagnostics.Trace.WriteLine( "Image download failed for " + source + ": " + ex.Message );
    }
    finally
    {
        // Navigate hands the response back through the out parameter, so it is released here.
        if ( response != null )
        {
            response.Close();
        }
        if ( memStream != null )
        {
            memStream.Close();
        }
    }
}
/// <summary>
/// Click handler for the Crawl button; runs the crawl with the current form settings.
/// </summary>
private void btnCrawl_Click(object sender, System.EventArgs e)
{
    this.TestCrawler();
}
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -