crawler.cs
using System;
using System.Text;
using System.Xml;
using System.IO;
using System.Collections;
using System.Net;

namespace Noviway.WebCrawler
{
    public class Crawler
    {
        private static void __NewPageEvent(WebPage page, int level)
        {
            // Print every email address found in the page's HTML
            string content = page.HTML;
            ArrayList emails = new ArrayList();
            ExtractEmailAddresses(content, ref emails);
            foreach (string email in emails)
                Console.WriteLine(email);
        }
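        // The handler above matches the NewPageCallback delegate declared below;
        // Main attaches it through the NewPageEvent property, so it runs once for
        // every page the crawler fetches.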
        static void Main()
        {
            Crawler crawler = new Crawler(Noviway.IO.Logger.Empty);
            crawler.MaxCrawlingLevels = 2;
            crawler.SaveBuffer = true;
            crawler.SaveHTML = true;
            crawler.DontLeaveSite = true;
            crawler.NewPageEvent = new NewPageCallback(__NewPageEvent);
            crawler.Crawl("http://www.noviway.com");
        }
        /// <summary>
        /// Called for every new page that has been found
        /// </summary>
        public delegate void NewPageCallback(WebPage page, int level);
        /// <summary>
        /// Web page
        /// </summary>
        public class WebPage
        {
            /// <summary>
            /// Url of the page
            /// </summary>
            public string Url;
            /// <summary>
            /// Xml version of this page
            /// </summary>
            public XmlDocument XML;
            /// <summary>
            /// Original page buffer
            /// </summary>
            public byte[] Buffer;
            /// <summary>
            /// HTML content
            /// </summary>
            public string HTML;
            /// <summary>
            /// Encoding of the page
            /// </summary>
            public Encoding PageEncoding;
            /// <summary>
            /// Content type of the page
            /// </summary>
            public string ContentType;

            public WebPage()
            {
            }

            public WebPage(string url, XmlDocument document, byte[] buffer, Encoding encoding, string html, string contentType)
            {
                this.Url = url;
                this.XML = document;
                this.Buffer = buffer;
                this.PageEncoding = encoding;
                this.HTML = html;
                this.ContentType = contentType;
            }
        }
        /// <summary>
        /// Urls we have already visited; each page is crawled only once
        /// </summary>
        private Hashtable m_UrlVisited = new Hashtable();
        /// <summary>
        /// Password for the Html To Xml component
        /// </summary>
        private readonly string m_HtmlToXmlPassword;
        /// <summary>
        /// HTTP browser; kept as a field so cookies persist between requests and connections are reused
        /// </summary>
        private Noviway.HTTPBrowser.Browser m_Browser = new Noviway.HTTPBrowser.Browser();
        /// <summary>
        /// Domain we started crawling from (lower case)
        /// </summary>
        private string m_Domain = string.Empty;
        private readonly Noviway.IO.Logger m_Logger;
        #region Rules
        private int m_MaxCrawlingLevels = 3;
        private bool m_SaveBuffer = false;
        private bool m_SaveHTML = false;
        private bool m_DontLeaveSite = true;
        #endregion

        private NewPageCallback m_NewPageEvent = null;
        /// <summary>
        /// Restrict the crawl to the starting site's domain
        /// </summary>
        public bool DontLeaveSite
        {
            set
            {
                m_DontLeaveSite = value;
            }
            get
            {
                return m_DontLeaveSite;
            }
        }
        /// <summary>
        /// New page event
        /// </summary>
        public NewPageCallback NewPageEvent
        {
            set
            {
                m_NewPageEvent = value;
            }
        }
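        // Note that NewPageEvent stores a single callback rather than a multicast
        // event: assigning a new handler replaces the previous one.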
        /// <summary>
        /// Maximum crawling depth
        /// </summary>
        public int MaxCrawlingLevels
        {
            get
            {
                return m_MaxCrawlingLevels;
            }
            set
            {
                m_MaxCrawlingLevels = value;
            }
        }
        /// <summary>
        /// Should we keep the HTML string on each WebPage
        /// </summary>
        public bool SaveHTML
        {
            get
            {
                return m_SaveHTML;
            }
            set
            {
                m_SaveHTML = value;
            }
        }
        /// <summary>
        /// Should we keep the raw byte buffer on each WebPage
        /// </summary>
        public bool SaveBuffer
        {
            get
            {
                return m_SaveBuffer;
            }
            set
            {
                m_SaveBuffer = value;
            }
        }
        /// <summary>
        /// Crawler
        /// </summary>
        public Crawler(Noviway.IO.Logger logger)
        {
            m_Logger = logger;
        }

        public Crawler(string html2XmlPassword)
        {
            m_HtmlToXmlPassword = html2XmlPassword;
            // Fall back to the empty logger so the catch block in Crawl
            // never dereferences a null m_Logger
            m_Logger = Noviway.IO.Logger.Empty;
        }
        /// <summary>
        /// Get HTML from a website
        /// </summary>
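        /// <remarks>
        /// Returns false if the navigation fails. When saveFileName is non-empty,
        /// the raw page bytes are also written to that file.
        /// </remarks>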
        bool GetHTMLFromSite(string url, ref string html, out System.Text.Encoding encoding, string saveFileName, out byte[] buffer, ref string contentType)
        {
            encoding = null;
            buffer = null;
            bool result = false;
            // The memory stream will hold the binary HTML. A MemoryStream is used
            // because the data arrives in several fetch calls rather than all at once.
            MemoryStream memStream = new MemoryStream();
            HttpWebResponse response = null;
            // Navigate to the desired location
            result = m_Browser.Navigate(new Noviway.HTTPBrowser.Browser.Stage("GET", url, string.Empty), ref memStream, out response);
            if (!result)
            {
                memStream.Close();
                return false;
            }
            if (response != null)
                contentType = response.ContentType;
            // Only now, convert to a byte array
            buffer = memStream.ToArray();
            // Decode the buffer into a unicode string; the page's own encoding is
            // detected and handled inside GetHTMLString
            result = Noviway.HTTPBrowser.Browser.GetHTMLString(buffer, out encoding, ref html);
            // Close stream
            memStream.Close();
            // Optionally persist the raw page to disk
            if (saveFileName.Length > 0)
            {
                using (FileStream fs = File.Create(saveFileName))
                {
                    fs.Write(buffer, 0, buffer.Length);
                }
            }
            return result;
        }
        /// <summary>
        /// Get the url without the trailing file name
        /// </summary>
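        /// <remarks>
        /// For example, "http://www.noviway.com/pages/index.html" becomes
        /// "http://www.noviway.com/pages", while a bare domain url such as
        /// "http://www.noviway.com" is returned unchanged.
        /// </remarks>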
        public static string GetUrlWithoutFile(string url)
        {
            int lastSlashPos = url.LastIndexOf('/');
            string newUrl = string.Empty;
            if (lastSlashPos == -1)
                return url;
            // A slash at position 7 or earlier belongs to the "http://" scheme,
            // so in that case there is no file part to strip
            if (lastSlashPos > 7)
                newUrl = url.Substring(0, lastSlashPos);
            else
                newUrl = url;
            return newUrl;
        }
        /// <summary>
        /// Get the domain part of a url
        /// </summary>
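        /// <remarks>
        /// For example, "http://www.noviway.com/pages/index.html" yields
        /// "www.noviway.com".
        /// </remarks>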
        public static string GetDomain(string url)
        {
            // Strip the scheme ("http://", "https://", ...)
            int pos = url.LastIndexOf("://");
            if (pos != -1)
                url = url.Substring(pos + 3);
            // The domain ends at the first slash, if any
            pos = url.IndexOf('/');
            if (pos == -1)
                return url;
            return url.Substring(0, pos);
        }
        /// <summary>
        /// Resolve a (possibly relative) url against the page it was found on
        /// </summary>
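        /// <remarks>
        /// For example, "page2.html" found on "http://www.noviway.com/pages/index.html"
        /// resolves to "http://www.noviway.com/pages/page2.html", while the
        /// root-relative "/contact.html" resolves to "http://www.noviway.com/contact.html".
        /// </remarks>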
        public static string GetAbsoluteUrl(string originalUrl, string url)
        {
            // Already absolute
            if (url.StartsWith("http://") || url.StartsWith("https://"))
                return url;
            // Root-relative urls resolve against the scheme and domain only
            if (url[0] == '/')
            {
                int schemeEnd = originalUrl.IndexOf("://");
                string prefix = (schemeEnd != -1) ? originalUrl.Substring(0, schemeEnd + 3) : "http://";
                return prefix + GetDomain(originalUrl) + url;
            }
            // Otherwise resolve relative to the directory of the original url
            return GetUrlWithoutFile(originalUrl) + "/" + url;
        }
        private bool Crawl(string url, int level)
        {
            try
            {
                // HTML unicode string
                string html = string.Empty;
                // Encoding of the HTML
                System.Text.Encoding encoding = null;
                bool result = false;
                byte[] buffer = null;
                string contentType = string.Empty;
                // Get html from site
                result = GetHTMLFromSite(url, ref html, out encoding, string.Empty, out buffer, ref contentType);
                // Create an XmlDocument object
                XmlDocument xmlDoc = null;
                // Create logger, leave the file name empty if no log is needed
                Noviway.IO.Logger logger = new Noviway.IO.Logger(string.Empty);
                // Create the HTML parser
                Noviway.HTMLParser.Parser parser = new Noviway.HTMLParser.Parser(logger, html, m_HtmlToXmlPassword);
                // Process the data
                parser.Process();
                // Create our special HTML document structure
                Noviway.HTMLParser.HTMLDocument doc = new Noviway.HTMLParser.HTMLDocument(logger, parser.Tags, encoding);
                // Process our HTML document
                result = doc.Process();
                // Now we create the XML document
                result = doc.CreateXml(out xmlDoc);
                // Build the WebPage handed to the event; the HTML string and raw
                // buffer are only kept when the corresponding Save* rules are on
                WebPage page = new WebPage();
                page.PageEncoding = encoding;
                page.Url = url;
                page.XML = xmlDoc;
                page.HTML = (m_SaveHTML) ? html : string.Empty;
                page.Buffer = (m_SaveBuffer) ? buffer : null;
                page.ContentType = contentType;
                // Find all descendant anchor tags <A>
                ArrayList list = doc.GetElementsByTagName("a");
                string href = string.Empty;
                // Mark this url as visited
                m_UrlVisited[url] = true;
                // We have the page, fire the event
                if (m_NewPageEvent != null)
                    m_NewPageEvent(page, level);
                foreach (XmlElement element in list)
                {
                    try
                    {
                        href = element.Attributes["href"].Value.ToLower();
                        // Skip script and mail links
                        if (href.StartsWith("javascript:"))
                            continue;
                        if (href.StartsWith("mailto:"))
                            continue;
                        // Get absolute url
                        href = GetAbsoluteUrl(url, href);
                        // Recurse while we are within the depth limit, have not seen
                        // this url yet, and (optionally) stay on the starting domain
                        if (level < m_MaxCrawlingLevels && href != url && m_UrlVisited[href] == null &&
                            (!m_DontLeaveSite || GetDomain(href).ToLower() == m_Domain))
                            Crawl(href, level + 1);
                    }
                    catch
                    {
                        // Anchors without a valid href attribute are simply skipped
                    }
                }
                return true;
            }
            catch (Exception exp)
            {
                m_Logger.WriteLog(exp);
                return false;
            }
        }
        /// <summary>
        /// Start crawling
        /// </summary>
        /// <param name="url">Url to start from</param>
        /// <returns>true if the crawl completed, false if it failed with an exception</returns>
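        /// <remarks>
        /// Crawling starts at level 1 and follows links recursively until
        /// MaxCrawlingLevels is reached; when DontLeaveSite is set, only links on
        /// the starting domain are followed.
        /// </remarks>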
        public bool Crawl(string url)
        {
            m_Domain = GetDomain(url).ToLower();
            return Crawl(url, 1);
        }
        /// <summary>
        /// Split an email address into user name, domain name and domain extension
        /// </summary>
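        /// <remarks>
        /// For example, "user@noviway.com" yields userName "user", domainName
        /// "noviway" and domainExt "com". Addresses with no '@', no dot in the
        /// domain, or a one-letter extension are rejected.
        /// </remarks>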
        private static bool TranslateEmail(string email, ref string userName, ref string domainName, ref string domainExt)
        {
            try
            {
                // Make sure it's a valid email
                int atPos = email.IndexOf('@');
                if (atPos > 0)
                {
                    userName = email.Substring(0, atPos);
                    string domain = email.Substring(atPos + 1);
                    int dotPos = -1;
                    // Make sure '@' is not at the end
                    if (domain.Length == 0)
                        return false;
                    // The dot must be present and must not start or end the domain
                    dotPos = domain.IndexOf('.');
                    if (dotPos == -1 || dotPos == 0 || dotPos == domain.Length - 1)
                        return false;
                    domainName = domain.Substring(0, dotPos);
                    domainExt = domain.Substring(dotPos + 1);
                    // A one-letter extension is not a valid top level domain
                    if (domainExt.Length == 1)
                        return false;
                }
                else
                    return false;
                return true;
            }
            catch
            {
                return false;
            }
        }
        /// <summary>
        /// Extract all email addresses found in the given text
        /// </summary>
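        /// <remarks>
        /// The scan expands around each '@' over the characters allowed in an
        /// address, so "contact us at info@noviway.com today" yields the single
        /// entry "info@noviway.com". Results are de-duplicated and sorted.
        /// </remarks>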
        public static bool ExtractEmailAddresses(string content, ref ArrayList emailList)
        {
            // Characters that may appear in an email address
            const string GOOD_CHARS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_-.@";
            // SortedList keys give us de-duplication and alphabetical order
            SortedList emails = new SortedList();
            try
            {
                int beginPos, endPos, pos, lastPos = 0;
                while (true)
                {
                    // Find the next '@' and expand around it
                    pos = content.IndexOf('@', lastPos);
                    if (pos == -1)
                        break;
                    // Walk left to the first character that cannot be part of an address
                    for (beginPos = pos - 1; beginPos >= 0; beginPos--)
                    {
                        if (GOOD_CHARS.IndexOf(content[beginPos]) == -1)
                            break;
                    }
                    // Walk right the same way
                    for (endPos = pos + 1; endPos < content.Length; endPos++)
                    {
                        if (GOOD_CHARS.IndexOf(content[endPos]) == -1)
                            break;
                    }
                    string email = content.Substring(beginPos + 1, endPos - beginPos - 1).ToLower();
                    string userName = string.Empty;
                    string domainName = string.Empty;
                    string domainExt = string.Empty;
                    // Keep the candidate only if it splits into valid parts
                    if (TranslateEmail(email, ref userName, ref domainName, ref domainExt))
                        emails[email.Trim().ToLower()] = true;
                    lastPos = endPos;
                }
                foreach (string email in emails.Keys)
                    emailList.Add(email);
                return true;
            }
            catch
            {
                return false;
            }
        }
    }
}