// documentworker.cs
// (Code-viewer residue: "font size" control label.)
using System;
using System.Net;
using System.IO;
using System.Threading;
using System.Text;
using ShootSearch.IFilter;
using ShootSearch.Plugin;
using ShootSearch.Indexing;
using ShootSearch.Logging;
using System.Security.Cryptography;
using ShootSearch.Helper;
using System.Text.RegularExpressions;
namespace ShootSearch.Spiders.http
{
/// <summary>
/// Performs all of the work of a single worker thread for the spider.
/// This involves waiting for a URL to become available, downloading it,
/// and then processing the page.
/// </summary>
public class DocumentWorker
{
/// <summary>
/// The URI this worker is currently handling. It is assigned from
/// m_spider.ObtainWork() in Process() and is also used as the base
/// when relative links are resolved in ProcessLink().
/// </summary>
private Uri m_uri;
/// <summary>
/// The spider that this thread "works for".
/// </summary>
private Spider m_spider;
/// <summary>
/// The thread that is being used; it is created in start().
/// </summary>
private Thread m_thread;
/// <summary>
/// The thread number, used to identify this worker.
/// </summary>
private int m_number;
/// <summary>
/// The name for default documents. convertFilename() uses it when a
/// URI path names a directory rather than a file.
/// </summary>
public const string IndexFile = "index.html";
/// <summary>
/// Creates a worker that is bound to the given spider.
/// </summary>
/// <param name="spider">The spider that owns this worker.</param>
public DocumentWorker(Spider spider)
{
    this.m_spider = spider;
}
/// <summary>
/// Text encoding selected by GetPage() for the page it downloaded;
/// SaveTextFile() passes it to the StreamWriter that stores the page.
/// </summary>
private Encoding encoding;
#region Convert uri to filename
/// <summary>
/// Takes a URI, such as /images/blank.gif, and converts it into the name
/// of a file for local storage below the spider's output path. The
/// directory structure needed to hold the file is created by this method.
/// </summary>
/// <param name="uri">The URI of the file about to be stored.</param>
/// <param name="p_Encode">
/// When true, the file name is replaced by the formatted Base64 SHA1 hash of
/// the full local name plus ".htm"; when false, the name derived from the
/// URI path and query is used as is.
/// </param>
/// <returns>The full local path for the file.</returns>
private string convertFilename(Uri uri , bool p_Encode)
{
    string result = m_spider.OutputPath + "\\" + uri.Host.Replace(".","_");
    // Dynamic pages: decode the URI so that the query string can be folded
    // into the file name.
    uri = new Uri(System.Web.HttpUtility.UrlDecode(uri.ToString()));
    string fileparameter = Tools.formatFilename(uri.Query);

    // add ending slash if needed
    if( result[result.Length-1]!='\\' )
        result = result+"\\";

    // Make sure the per-host directory exists. The loop below only creates
    // directories for sub-directory segments, so a file that lives directly
    // under the site root would otherwise have no directory to be saved in.
    Directory.CreateDirectory(result);

    // strip the query if needed
    string path = uri.PathAndQuery;
    int queryIndex = path.IndexOf("?");
    if( queryIndex!=-1 )
        path = path.Substring(0,queryIndex);

    // A last segment without a dot is treated as a directory that is
    // missing its trailing slash; store it as <dir>/index.html.
    int lastSlash = path.LastIndexOf('/');
    int lastDot = path.LastIndexOf('.');
    if( path[path.Length-1]!='/' && lastSlash>lastDot )
        path+="/"+IndexFile;

    // determine actual filename
    lastSlash = path.LastIndexOf('/');
    string filename = "";
    if(lastSlash!=-1)
    {
        filename = path.Substring(1+lastSlash);
        path = path.Substring(0,1+lastSlash);
        if(filename.Equals("") )
            filename = IndexFile;
    }

    // Create the directory structure, one path segment at a time.
    // Index 0 is the leading '/', so the first segment starts at 1.
    int segmentStart = 1;
    int segmentEnd = path.IndexOf('/',segmentStart);
    while(segmentEnd!=-1)
    {
        result += path.Substring(segmentStart,segmentEnd-segmentStart) + "\\";
        Directory.CreateDirectory(result);
        segmentStart = segmentEnd+1;
        segmentEnd = path.IndexOf('/',segmentStart);
    }

    // attach name
    string newName = result + filename + fileparameter;
    if(!p_Encode)
        return newName;

    // NOTE(review): ASCII encoding is kept so that generated names stay the
    // same as before; names that differ only in non-ASCII characters
    // presumably hash to the same file - confirm whether that matters.
    byte [] digest;
    using(SHA1 sha = new SHA1CryptoServiceProvider())
    {
        digest = sha.ComputeHash(Encoding.ASCII.GetBytes(newName));
    }
    return result + Tools.formatFilename(Convert.ToBase64String(digest)) + ".htm";
}
#endregion
#region Save a binary file to disk.
/// <summary>
/// Saves a binary (non-text) response to disk and registers it with the
/// index. Nothing is saved when the spider has no output path or when the
/// file extension is not listed in Config.FileTypes.
/// </summary>
/// <param name="response">The response used to save the file.</param>
private void SaveBinaryFile(WebResponse response)
{
    if( m_spider.OutputPath==null )
        return;
    string filename = convertFilename( response.ResponseUri ,false);

    // A name without any dot has no extension to check; skip it instead of
    // letting Substring(-1) throw an exception the caller does not handle.
    int dot = filename.LastIndexOf('.');
    if( dot<0 )
        return;
    string fileExtension = filename.Substring(dot);
    if(!Config.FileTypes.Contains(fileExtension.ToUpper()))
    {
        // The configured file type list does not contain this type.
        return;
    }

    byte [] buffer = new byte[1024];
    // The using blocks close both streams even when the copy fails part-way.
    using(Stream outStream = File.Create( filename ))
    using(Stream inStream = response.GetResponseStream())
    {
        int l;
        while( (l = inStream.Read(buffer,0,buffer.Length))>0 )
            outStream.Write(buffer,0,l);
    }
    IndexReptile.Add(m_uri,filename,Tools.GetUrlName(m_uri));
}
#endregion
#region Save a text file.
/// <summary>
/// Saves the text of the current page to disk using the encoding chosen
/// while downloading, then registers the file with the index. Saving is
/// best-effort: a failure is logged and does not propagate to the caller.
/// </summary>
/// <param name="buffer">The text to save.</param>
private void SaveTextFile(string buffer)
{
    try
    {
        if( m_spider.OutputPath==null )
            return;

        // .css and .xml documents keep their readable name; every other
        // page is stored under a hashed name.
        string address = m_uri.ToString();
        string suffix = address.Substring(address.Length-4).ToLower();
        bool keepName = (suffix == ".css" || suffix == ".xml");
        string filename = convertFilename( m_uri , !keepName);

        // The using block closes the writer even when Write throws.
        using(StreamWriter outStream = new StreamWriter( filename ,false ,encoding ))
        {
            outStream.Write(buffer);
        }
        IndexReptile.Add(m_uri,filename,getTitle(buffer));
    }
    catch(Exception e)
    {
        // Still best-effort, but no longer silent: record why the save failed.
        Log.ResultURL("Save failed for " + m_uri + ": " + e);
    }
}
#endregion
#region Download a page
/// <summary>
/// Downloads the resource at the current URI. A response whose content type
/// does not start with "text/" is handed to SaveBinaryFile and null is
/// returned. For text, the body is downloaded once, decoded with the
/// system default encoding so that Tools.GetEncoding can inspect it, and
/// decoded again from the same bytes when a different encoding is reported.
/// The chosen encoding is stored in the encoding field.
/// </summary>
/// <returns>
/// The page text, or null when the resource is not text or when the
/// download fails with a WebException or IOException (which is logged).
/// </returns>
private string GetPage()
{
    WebResponse response = null;
    Stream stream = null;
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(m_uri);
        request.Timeout = 60000;
        response = request.GetResponse();
        stream = response.GetResponseStream();
        if( !response.ContentType.ToLower().StartsWith("text/") )
        {
            SaveBinaryFile(response);
            return null;
        }

        // Keep the raw bytes so the page can be decoded a second time
        // without issuing a second request.
        byte [] raw;
        using(MemoryStream body = new MemoryStream())
        {
            byte [] chunk = new byte[4096];
            int read;
            while( (read = stream.Read(chunk,0,chunk.Length))>0 )
                body.Write(chunk,0,read);
            raw = body.ToArray();
        }

        // First pass: default encoding, line endings normalised to CRLF.
        StringBuilder text = new StringBuilder();
        using(StreamReader firstPass = new StreamReader(new MemoryStream(raw),System.Text.Encoding.Default))
        {
            string line;
            while( (line = firstPass.ReadLine())!=null )
                text.Append(line).Append("\r\n");
        }
        string buffer = text.ToString();

        string strEncoding = Tools.GetEncoding(buffer);
        if(strEncoding == "UTF-8")
            encoding = Encoding.UTF8 ;
        else if (strEncoding == "UTF-7")
            encoding = Encoding.UTF7 ;
        else if (strEncoding == "UNICODE")
            encoding = Encoding.Unicode;
        else
            encoding = Encoding.Default;

        // Second pass: decode the same bytes with the reported encoding.
        if (encoding != Encoding.Default)
        {
            using(StreamReader secondPass = new StreamReader(new MemoryStream(raw),encoding))
            {
                buffer = secondPass.ReadToEnd();
            }
        }
        return buffer;
    }
    catch(WebException e)
    {
        Log.ResultURL("下载失败,错误:" + e);
        return null;
    }
    catch(IOException e)
    {
        Log.ResultURL("下载失败,错误:" + e);
        return null;
    }
    finally
    {
        if( stream!=null ) stream.Close();
        if( response!=null ) response.Close();
    }
}
#endregion
#region ProcessLink
/// <summary>
/// Processes one link found in a page. The link is resolved against the
/// current URI and queued on the spider when it is an http or https URL,
/// passes the host check, and contains the spider's base URI. Whether a URL
/// has been visited before is left to the spider.
/// </summary>
/// <param name="link">The link text to process.</param>
/// <returns>
/// The absolute URL when the link resolves to an http/https URL that passes
/// the host check; otherwise the link itself (with any trailing fragment
/// removed and "&amp;amp;" decoded). A null link is returned unchanged.
/// </returns>
private string ProcessLink(string link)
{
    // Attributes without a value would otherwise fail on the calls below.
    if(link == null)
        return link;
    Log.Debug(link);
    if(link == "#")
        return link;

    // Cut off a fragment that follows the last path separator.
    while(link.LastIndexOf('#') > link.LastIndexOf('/') )
    {
        link = link.Substring(0,link.LastIndexOf('#'));
    }
    // Links taken from HTML carry '&' as the entity "&amp;"; decode it so
    // that the query string is requested as intended.
    link = link.Replace("&amp;","&");

    // fully expand this URL if it was a relative link
    Uri url;
    try
    {
        url = new Uri(m_uri,link,true);
    }
    catch(UriFormatException e)
    {
        Log.ResultURL( "Invalid URI:" + link +" Error:" + e.Message);
        return link;
    }

    string scheme = url.Scheme.ToLower();
    if(!scheme.Equals("http") && !scheme.Equals("https") )
        return link;

    // NOTE(review): the same-host restriction applies only when
    // Config.OnlyStartURL is false - confirm this is the intended polarity.
    if(!Config.OnlyStartURL && !url.Host.ToLower().Equals( m_uri.Host.ToLower() ) )
        return link;

    // Only URLs that contain the spider's base URI are queued.
    if(url.ToString().IndexOf(this.m_spider.BaseURI.ToString()) >= 0)
    {
        m_spider.addURI( url );
    }
    else
    {
        Log.Debug(url);
    }
    return url.ToString();
}
#endregion
#region ProcessPage
/// <summary>
/// Scans a downloaded page for tags, passes every HREF and SRC attribute
/// value to ProcessLink, and rewrites those values in the page text to
/// whatever ProcessLink returns.
/// </summary>
/// <param name="page">The text of the page to process.</param>
/// <returns>The page text with the link values rewritten.</returns>
private string ProcessPage(string page)
{
    ParseHTML parse = new ParseHTML();
    parse.Source = page;
    while(!parse.Eof())
    {
        char ch = parse.Parse();
        // A zero character from the parser means a tag was just read.
        if(ch==0)
        {
            page = rewriteLinkAttribute(page, parse.GetTag()["HREF"]);
            page = rewriteLinkAttribute(page, parse.GetTag()["SRC"]);
        }
    }
    return page;
}

/// <summary>
/// Runs one link attribute through ProcessLink and replaces its old value
/// with the result in the page text (double-quoted, single-quoted and
/// unquoted forms).
/// </summary>
/// <param name="page">The current page text.</param>
/// <param name="a">The attribute to rewrite; may be null when the tag lacks it.</param>
/// <returns>The page text after the replacement.</returns>
private string rewriteLinkAttribute(string page, Attribute a)
{
    if( a==null )
        return page;
    string oldValue = a.Value;
    // An empty value must be skipped: replacing "=" + "" would rewrite
    // every '=' character in the whole page.
    if( oldValue==null || oldValue.Length==0 )
        return page;

    string newValue = ProcessLink(oldValue);
    if( newValue==oldValue )
        return page;

    // NOTE(review): these are page-wide text replacements, so any other
    // text that equals the old value is rewritten as well.
    page = page.Replace( "\"" + oldValue + "\"" , "\"" + newValue + "\"" );
    page = page.Replace( "'" + oldValue + "'" , "'" + newValue + "'" );
    page = page.Replace( "=" + oldValue , "=" + newValue );
    return page;
}
#endregion
#region Process()
/// <summary>
/// The main loop for a spider thread. Until the spider signals Quit, it
/// obtains a URI from the spider, downloads it, processes the links in the
/// page and saves the result. The thread ends when this method returns.
/// </summary>
public void Process()
{
    while(!m_spider.Quit )
    {
        m_uri = m_spider.ObtainWork();
        m_spider.SpiderDone.WorkerBegin();
        try
        {
            Log.ResultURL("Download("+this.Number+"):"+m_uri);
            string page = GetPage();
            if(page!=null)
            {
                page = ProcessPage(page);
                SaveTextFile(page);
            }
        }
        catch(ThreadAbortException)
        {
            // Raised by StopWork(); let the abort end the thread.
            throw;
        }
        catch(Exception e)
        {
            // One bad page must not take the worker down; log it and move on.
            Log.ResultURL("Worker(" + this.Number + ") failed on " + m_uri + ": " + e);
        }
        finally
        {
            // Always pair WorkerBegin with WorkerEnd, even when a step failed.
            m_spider.SpiderDone.WorkerEnd();
        }
    }
    // Quit was signalled: returning ends the thread, so the worker no longer
    // needs to abort itself.
}
#endregion
#region Start the thread.
/// <summary>
/// Creates the worker thread, names it, and starts it running Process().
/// </summary>
public void start()
{
    m_thread = new Thread(new ThreadStart( this.Process ));
    // Name the thread before it starts so it never runs unnamed.
    m_thread.Name = "Shoot-Spider";
    m_thread.Start();
}
#endregion
#region The thread number. Used only to identify this thread.
/// <summary>
/// Gets or sets the number of this worker. It is used only to identify
/// the thread.
/// </summary>
public int Number
{
    get { return this.m_number; }
    set { this.m_number = value; }
}
#endregion
#region Finds a title of HTML file.
/// <summary>
/// Finds the title of an HTML document. The tag name is matched without
/// regard to case, the title may span several lines, and only the first
/// title element is used.
/// </summary>
/// <param name="html">HTML document source.</param>
/// <returns>
/// The text between the title tags, or "(unknown)" when the document is
/// null or has no title element.
/// </returns>
private string getTitle(string html)
{
    if (html == null)
        return "(unknown)";
    // Lazy match so that text after the first closing tag is not captured;
    // Singleline lets '.' cross line breaks.
    Match m = Regex.Match(html, "<title(?:\\s[^>]*)?>(.*?)</title>",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);
    // Groups.Count reflects the pattern, not the outcome, so test Success.
    if (m.Success)
        return m.Groups[1].Value;
    return "(unknown)";
}
#endregion
#region StopWork
/// <summary>
/// Stops this worker by aborting its thread. Does nothing when the thread
/// was never started. Stopping is best-effort: a failure to abort is
/// logged and not propagated.
/// </summary>
public void StopWork()
{
    Thread worker = this.m_thread;
    if (worker == null)
        return;
    try
    {
        worker.Abort();
    }
    catch (Exception e)
    {
        Log.Debug("StopWork: could not abort worker " + m_number + ": " + e.Message);
    }
}
#endregion
}
}
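// Code-viewer residue (keyboard shortcut help), kept as a comment:
//   Copy code:            Ctrl + C
//   Search code:          Ctrl + F
//   Full-screen mode:     F11
//   Switch theme:         Ctrl + Shift + D
//   Show shortcuts:       ?
//   Increase font size:   Ctrl + =
//   Decrease font size:   Ctrl + -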