crawler.cs
using System;
using System.Text;
using System.Xml;
using System.IO;
using System.Collections;
using System.Net;

namespace Noviway.WebCrawler
{
    public class Crawler
    {
        private static void __NewPageEvent(WebPage page, int level)
        {
            // Print every email address found in the page's HTML
            string content = page.HTML;
            ArrayList emails = new ArrayList();
            ExtractEmailAddresses(content, ref emails);
            foreach (string email in emails)
                Console.WriteLine(email);
        }
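        // The handler above matches the NewPageCallback delegate declared below;
        // Main attaches it through the NewPageEvent property, so it runs once for
        // every page the crawler fetches.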
        static void Main()
        {
            Crawler crawler = new Crawler(Noviway.IO.Logger.Empty);
            crawler.MaxCrawlingLevels = 2;
            crawler.SaveBuffer = true;
            crawler.SaveHTML = true;
            crawler.DontLeaveSite = true;
            crawler.NewPageEvent = new NewPageCallback(__NewPageEvent);
            crawler.Crawl("http://www.noviway.com");
        }
        /// <summary>
        /// Called for every new page that has been found
        /// </summary>
        public delegate void NewPageCallback(WebPage page, int level);
        /// <summary>
        /// Web page
        /// </summary>
        public class WebPage
        {
            /// <summary>
            /// Url of the page
            /// </summary>
            public string Url;
            /// <summary>
            /// Xml version of this page
            /// </summary>
            public XmlDocument XML;
            /// <summary>
            /// Original page buffer
            /// </summary>
            public byte[] Buffer;
            /// <summary>
            /// HTML content
            /// </summary>
            public string HTML;
            /// <summary>
            /// Encoding of the page
            /// </summary>
            public Encoding PageEncoding;
            /// <summary>
            /// Content type of the page
            /// </summary>
            public string ContentType;

            public WebPage()
            {
            }

            public WebPage(string url, XmlDocument document, byte[] buffer, Encoding encoding, string html, string contentType)
            {
                this.Url = url;
                this.XML = document;
                this.Buffer = buffer;
                this.PageEncoding = encoding;
                this.HTML = html;
                this.ContentType = contentType;
            }
        }
        /// <summary>
        /// Urls we have already visited; each page is crawled only once
        /// </summary>
        private Hashtable m_UrlVisited = new Hashtable();
        /// <summary>
        /// Password for the Html To Xml component
        /// </summary>
        private readonly string m_HtmlToXmlPassword;
        /// <summary>
        /// HTTP browser; kept as a field so cookies persist between requests and connections are reused
        /// </summary>
        private Noviway.HTTPBrowser.Browser m_Browser = new Noviway.HTTPBrowser.Browser();
        /// <summary>
        /// Domain we started crawling from (lower case)
        /// </summary>
        private string m_Domain = string.Empty;
        private readonly Noviway.IO.Logger m_Logger;
        #region Rules
        private int m_MaxCrawlingLevels = 3;
        private bool m_SaveBuffer = false;
        private bool m_SaveHTML = false;
        private bool m_DontLeaveSite = true;
        #endregion

        private NewPageCallback m_NewPageEvent = null;
        /// <summary>
        /// Restrict the crawl to the starting site's domain
        /// </summary>
        public bool DontLeaveSite
        {
            set
            {
                m_DontLeaveSite = value;
            }
            get
            {
                return m_DontLeaveSite;
            }
        }
        /// <summary>
        /// New page event
        /// </summary>
        public NewPageCallback NewPageEvent
        {
            set
            {
                m_NewPageEvent = value;
            }
        }
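        // Note that NewPageEvent stores a single callback rather than a multicast
        // event: assigning a new handler replaces the previous one.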
        /// <summary>
        /// Maximum crawling depth
        /// </summary>
        public int MaxCrawlingLevels
        {
            get
            {
                return m_MaxCrawlingLevels;
            }
            set
            {
                m_MaxCrawlingLevels = value;
            }
        }
        /// <summary>
        /// Should we keep the HTML string on each WebPage
        /// </summary>
        public bool SaveHTML
        {
            get
            {
                return m_SaveHTML;
            }
            set
            {
                m_SaveHTML = value;
            }
        }
        /// <summary>
        /// Should we keep the raw byte buffer on each WebPage
        /// </summary>
        public bool SaveBuffer
        {
            get
            {
                return m_SaveBuffer;
            }
            set
            {
                m_SaveBuffer = value;
            }
        }
        /// <summary>
        /// Crawler
        /// </summary>
        public Crawler(Noviway.IO.Logger logger)
        {
            m_Logger = logger;
        }

        public Crawler(string html2XmlPassword)
        {
            m_HtmlToXmlPassword = html2XmlPassword;
            // Fall back to the empty logger so the catch block in Crawl
            // never dereferences a null m_Logger
            m_Logger = Noviway.IO.Logger.Empty;
        }
        /// <summary>
        /// Get HTML from a website
        /// </summary>
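        /// <remarks>
        /// Returns false if the navigation fails. When saveFileName is non-empty,
        /// the raw page bytes are also written to that file.
        /// </remarks>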
        bool GetHTMLFromSite(string url, ref string html, out System.Text.Encoding encoding, string saveFileName, out byte[] buffer, ref string contentType)
        {
            encoding = null;
            buffer = null;
            bool result = false;
            // The memory stream will hold the binary HTML. A MemoryStream is used
            // because the data arrives in several fetch calls rather than all at once.
            MemoryStream memStream = new MemoryStream();
            HttpWebResponse response = null;
            // Navigate to the desired location
            result = m_Browser.Navigate(new Noviway.HTTPBrowser.Browser.Stage("GET", url, string.Empty), ref memStream, out response);
            if (!result)
            {
                memStream.Close();
                return false;
            }
            if (response != null)
                contentType = response.ContentType;
            // Only now, convert to a byte array
            buffer = memStream.ToArray();
            // Decode the buffer into a unicode string; the page's own encoding is
            // detected and handled inside GetHTMLString
            result = Noviway.HTTPBrowser.Browser.GetHTMLString(buffer, out encoding, ref html);
            // Close stream
            memStream.Close();
            // Optionally persist the raw page to disk
            if (saveFileName.Length > 0)
            {
                using (FileStream fs = File.Create(saveFileName))
                {
                    fs.Write(buffer, 0, buffer.Length);
                }
            }
            return result;
        }
        /// <summary>
        /// Get the url without the trailing file name
        /// </summary>
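        /// <remarks>
        /// For example, "http://www.noviway.com/pages/index.html" becomes
        /// "http://www.noviway.com/pages", while a bare domain url such as
        /// "http://www.noviway.com" is returned unchanged.
        /// </remarks>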
        public static string GetUrlWithoutFile(string url)
        {
            int lastSlashPos = url.LastIndexOf('/');
            string newUrl = string.Empty;
            if (lastSlashPos == -1)
                return url;
            // A slash at position 7 or earlier belongs to the "http://" scheme,
            // so in that case there is no file part to strip
            if (lastSlashPos > 7)
                newUrl = url.Substring(0, lastSlashPos);
            else
                newUrl = url;
            return newUrl;
        }
        /// <summary>
        /// Get the domain part of a url
        /// </summary>
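        /// <remarks>
        /// For example, "http://www.noviway.com/pages/index.html" yields
        /// "www.noviway.com".
        /// </remarks>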
        public static string GetDomain(string url)
        {
            // Strip the scheme ("http://", "https://", ...)
            int pos = url.LastIndexOf("://");
            if (pos != -1)
                url = url.Substring(pos + 3);
            // The domain ends at the first slash, if any
            pos = url.IndexOf('/');
            if (pos == -1)
                return url;
            return url.Substring(0, pos);
        }
        /// <summary>
        /// Resolve a (possibly relative) url against the page it was found on
        /// </summary>
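        /// <remarks>
        /// For example, "page2.html" found on "http://www.noviway.com/pages/index.html"
        /// resolves to "http://www.noviway.com/pages/page2.html", while the
        /// root-relative "/contact.html" resolves to "http://www.noviway.com/contact.html".
        /// </remarks>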
        public static string GetAbsoluteUrl(string originalUrl, string url)
        {
            // Already absolute
            if (url.StartsWith("http://") || url.StartsWith("https://"))
                return url;
            // Root-relative urls resolve against the scheme and domain only
            if (url[0] == '/')
            {
                int schemeEnd = originalUrl.IndexOf("://");
                string prefix = (schemeEnd != -1) ? originalUrl.Substring(0, schemeEnd + 3) : "http://";
                return prefix + GetDomain(originalUrl) + url;
            }
            // Otherwise resolve relative to the directory of the original url
            return GetUrlWithoutFile(originalUrl) + "/" + url;
        }
        private bool Crawl(string url, int level)
        {
            try
            {
                // HTML unicode string
                string html = string.Empty;
                // Encoding of the HTML
                System.Text.Encoding encoding = null;
                bool result = false;
                byte[] buffer = null;
                string contentType = string.Empty;
                // Get html from site
                result = GetHTMLFromSite(url, ref html, out encoding, string.Empty, out buffer, ref contentType);
                // Create an XmlDocument object
                XmlDocument xmlDoc = null;
                // Create logger, leave the file name empty if no log is needed
                Noviway.IO.Logger logger = new Noviway.IO.Logger(string.Empty);
                // Create the HTML parser
                Noviway.HTMLParser.Parser parser = new Noviway.HTMLParser.Parser(logger, html, m_HtmlToXmlPassword);
                // Process the data
                parser.Process();
                // Create our special HTML document structure
                Noviway.HTMLParser.HTMLDocument doc = new Noviway.HTMLParser.HTMLDocument(logger, parser.Tags, encoding);
                // Process our HTML document
                result = doc.Process();
                // Now we create the XML document
                result = doc.CreateXml(out xmlDoc);
                // Build the WebPage handed to the event; the HTML string and raw
                // buffer are only kept when the corresponding Save* rules are on
                WebPage page = new WebPage();
                page.PageEncoding = encoding;
                page.Url = url;
                page.XML = xmlDoc;
                page.HTML = (m_SaveHTML) ? html : string.Empty;
                page.Buffer = (m_SaveBuffer) ? buffer : null;
                page.ContentType = contentType;
                // Find all descendant anchor tags <A>
                ArrayList list = doc.GetElementsByTagName("a");
                string href = string.Empty;
                // Mark this url as visited
                m_UrlVisited[url] = true;
                // We have the page, fire the event
                if (m_NewPageEvent != null)
                    m_NewPageEvent(page, level);
                foreach (XmlElement element in list)
                {
                    try
                    {
                        href = element.Attributes["href"].Value.ToLower();
                        // Skip script and mail links
                        if (href.StartsWith("javascript:"))
                            continue;
                        if (href.StartsWith("mailto:"))
                            continue;
                        // Get absolute url
                        href = GetAbsoluteUrl(url, href);
                        // Recurse while we are within the depth limit, have not seen
                        // this url yet, and (optionally) stay on the starting domain
                        if (level < m_MaxCrawlingLevels && href != url && m_UrlVisited[href] == null &&
                            (!m_DontLeaveSite || GetDomain(href).ToLower() == m_Domain))
                            Crawl(href, level + 1);
                    }
                    catch
                    {
                        // Anchors without a valid href attribute are simply skipped
                    }
                }
                return true;
            }
            catch (Exception exp)
            {
                m_Logger.WriteLog(exp);
                return false;
            }
        }
        /// <summary>
        /// Start crawling
        /// </summary>
        /// <param name="url">Url to start from</param>
        /// <returns>true if the crawl completed, false if it failed with an exception</returns>
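        /// <remarks>
        /// Crawling starts at level 1 and follows links recursively until
        /// MaxCrawlingLevels is reached; when DontLeaveSite is set, only links on
        /// the starting domain are followed.
        /// </remarks>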
        public bool Crawl(string url)
        {
            m_Domain = GetDomain(url).ToLower();
            return Crawl(url, 1);
        }
        /// <summary>
        /// Split an email address into user name, domain name and domain extension
        /// </summary>
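        /// <remarks>
        /// For example, "user@noviway.com" yields userName "user", domainName
        /// "noviway" and domainExt "com". Addresses with no '@', no dot in the
        /// domain, or a one-letter extension are rejected.
        /// </remarks>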
        private static bool TranslateEmail(string email, ref string userName, ref string domainName, ref string domainExt)
        {
            try
            {
                // Make sure it's a valid email
                int atPos = email.IndexOf('@');
                if (atPos > 0)
                {
                    userName = email.Substring(0, atPos);
                    string domain = email.Substring(atPos + 1);
                    int dotPos = -1;
                    // Make sure '@' is not at the end
                    if (domain.Length == 0)
                        return false;
                    // The dot must be present and must not start or end the domain
                    dotPos = domain.IndexOf('.');
                    if (dotPos == -1 || dotPos == 0 || dotPos == domain.Length - 1)
                        return false;
                    domainName = domain.Substring(0, dotPos);
                    domainExt = domain.Substring(dotPos + 1);
                    // A one-letter extension is not a valid top level domain
                    if (domainExt.Length == 1)
                        return false;
                }
                else
                    return false;
                return true;
            }
            catch
            {
                return false;
            }
        }
        /// <summary>
        /// Extract all email addresses found in the given text
        /// </summary>
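        /// <remarks>
        /// The scan expands around each '@' over the characters allowed in an
        /// address, so "contact us at info@noviway.com today" yields the single
        /// entry "info@noviway.com". Results are de-duplicated and sorted.
        /// </remarks>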
        public static bool ExtractEmailAddresses(string content, ref ArrayList emailList)
        {
            // Characters that may appear in an email address
            const string GOOD_CHARS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_-.@";
            // SortedList keys give us de-duplication and alphabetical order
            SortedList emails = new SortedList();
            try
            {
                int beginPos, endPos, pos, lastPos = 0;
                while (true)
                {
                    // Find the next '@' and expand around it
                    pos = content.IndexOf('@', lastPos);
                    if (pos == -1)
                        break;
                    // Walk left to the first character that cannot be part of an address
                    for (beginPos = pos - 1; beginPos >= 0; beginPos--)
                    {
                        if (GOOD_CHARS.IndexOf(content[beginPos]) == -1)
                            break;
                    }
                    // Walk right the same way
                    for (endPos = pos + 1; endPos < content.Length; endPos++)
                    {
                        if (GOOD_CHARS.IndexOf(content[endPos]) == -1)
                            break;
                    }
                    string email = content.Substring(beginPos + 1, endPos - beginPos - 1).ToLower();
                    string userName = string.Empty;
                    string domainName = string.Empty;
                    string domainExt = string.Empty;
                    // Keep the candidate only if it splits into valid parts
                    if (TranslateEmail(email, ref userName, ref domainName, ref domainExt))
                        emails[email.Trim().ToLower()] = true;
                    lastPos = endPos;
                }
                foreach (string email in emails.Keys)
                    emailList.Add(email);
                return true;
            }
            catch
            {
                return false;
            }
        }
    }
}