📄 spider.cs
字号:
using System;
using System.Collections;
using System.Net;
using System.Threading;
using System.Text.RegularExpressions;
namespace spider
{
/// <summary>
/// A simple web crawler. Starting from a seed URL it downloads a page, collects the
/// HREF attribute values found in it, and keeps following them until the work queue is
/// empty or <see cref="quit"/> is set. Crawling runs synchronously inside
/// <see cref="Start"/> (which calls the crawl loop directly).
/// Links whose page could not be downloaded are reported through <see cref="outurl"/>.
/// </summary>
public class Spider
{
    // Pending links exactly as they were found (raw HREF values, or the seed URL).
    private Queue OrUrl;
    // For every entry in OrUrl, the page it was found on. The two queues are always
    // enqueued and dequeued together so that entry i of one belongs to entry i of the other.
    private Queue MomUrl;
    /// <summary>
    /// One entry per link whose page could not be downloaded, formatted as
    /// "&lt;parent page&gt; &lt;resolved link&gt;".
    /// </summary>
    public Queue outurl;
    // Resolved URLs (as strings) of pages that were downloaded and scanned successfully.
    private Hashtable ReadUrl;
    // Set by the caller (through quit) to stop the loop, and set by the loop itself when it ends.
    private bool stop=false;
    // Number of links whose download failed.
    private int count=0;

    #region Methods
    /// <summary>
    /// Queues <paramref name="ReUrl"/> as the seed (it is its own parent page) and crawls
    /// until the queue is exhausted or <see cref="quit"/> becomes true.
    /// </summary>
    /// <param name="ReUrl">Seed URL. It is parsed with <c>new Uri(...)</c>, so it must be
    /// an absolute URI.</param>
    /// <exception cref="ArgumentNullException"><paramref name="ReUrl"/> is null.</exception>
    public void Start(string ReUrl)
    {
        // Fail before touching the queues so a bad call leaves the crawler state untouched.
        if (ReUrl == null)
        {
            throw new ArgumentNullException("ReUrl");
        }

        OrUrl.Enqueue(ReUrl);
        MomUrl.Enqueue(ReUrl);
        Cycle(ReUrl);
    }

    /// <summary>
    /// Main crawl loop: takes the next link and its parent page, resolves the link,
    /// downloads it and scans the page for further links.
    /// </summary>
    /// <param name="ReUrl">Seed URL, handed to the <c>url</c> helper's constructor.</param>
    private void Cycle(string ReUrl)
    {
        url urlformat = new url(ReUrl);
        Http myhttp = new Http();

        while (OrUrl.Count > 0)
        {
            if (stop)
            {
                break;
            }

            // Dequeue both entries before any check so the queues never get out of step.
            object rawLink = OrUrl.Dequeue();
            object rawParent = MomUrl.Dequeue();
            if (rawLink == null || rawParent == null)
            {
                // A null entry (e.g. an HREF attribute without a value) cannot be followed.
                continue;
            }

            string urlnow = rawLink.ToString();
            Uri momurl = new Uri(rawParent.ToString());

            // ProcessUrl is defined elsewhere; a null result means the link is not followed.
            // NOTE(review): presumably it resolves the link against its parent page - confirm.
            Uri urlproed = urlformat.ProcessUrl(urlnow, momurl);
            if (urlproed == null)
            {
                continue;
            }

            // De-duplicate on the resolved URL, not on the raw HREF text: the same relative
            // link found on two different pages can point to two different documents.
            string visitedKey = urlproed.ToString();
            if (ReadUrl.Contains(visitedKey))
            {
                continue;
            }

            string pagehtml = myhttp.GetPage(urlproed);
            if (pagehtml != null)
            {
                ProsessPage(pagehtml, urlproed);
                ReadUrl.Add(visitedKey, 1);
            }
            else
            {
                // Download failed: count it and report it together with the referring page.
                // Failed links are deliberately not marked as read, so every page that
                // references the same broken link produces its own report entry.
                count++;
                outurl.Enqueue(momurl + " " + urlproed.ToString());
            }
        }

        // Signals (through quit) that the crawl is no longer running.
        stop = true;
    }

    /// <summary>
    /// Discards all crawl state: pending links, visited pages, the failure report, the
    /// failure counter and the stop flag, so that <see cref="Start"/> can be called again.
    /// </summary>
    public void Reset()
    {
        OrUrl = new Queue();
        MomUrl = new Queue();
        outurl = new Queue();
        ReadUrl = new Hashtable();
        // The crawl loop leaves stop set to true when it finishes; without clearing it
        // here a second Start() after Reset() would leave the loop immediately.
        stop = false;
        count = 0;
    }

    /// <summary>
    /// Scans a downloaded page and queues every HREF attribute value found, paired with
    /// the page it came from.
    /// </summary>
    /// <param name="page">HTML text of the page.</param>
    /// <param name="momlink">URL of the page; recorded as the parent of every link found.</param>
    private void ProsessPage(string page, Uri momlink)
    {
        HtmlAnalysis Analysis = new HtmlAnalysis();
        Analysis.Source = page;

        while (!Analysis.IsEnd())
        {
            char ch = Analysis.Parse();
            // NOTE(review): Parse() returning 0 appears to mean "a tag was read" - confirm
            // against HtmlAnalysis.
            if (ch == 0)
            {
                Attribute a = Analysis.GetTag()["HREF"];
                if (a != null)
                {
                    // Always enqueue as a pair to keep OrUrl and MomUrl aligned.
                    OrUrl.Enqueue(a.Value);
                    MomUrl.Enqueue(momlink);
                }
            }
        }
    }

    /// <summary>
    /// Tests whether the regular expression <paramref name="keyword"/> matches anywhere
    /// in <paramref name="sourse"/>.
    /// </summary>
    /// <param name="keyword">Regular expression pattern.</param>
    /// <param name="sourse">Text to search.</param>
    /// <returns>True when the pattern matches, otherwise false.</returns>
    private bool SearchWord(string keyword, string sourse)
    {
        Regex myregex = new Regex(keyword);
        return myregex.IsMatch(sourse);
    }
    #endregion

    #region Constructor
    /// <summary>
    /// Creates a crawler with empty queues, ready for <see cref="Start"/>.
    /// </summary>
    public Spider()
    {
        Reset();
    }
    #endregion

    #region Properties
    /// <summary>
    /// Stop flag. Set to true to make the crawl loop exit before its next link; the loop
    /// also sets it to true itself when it ends.
    /// </summary>
    public bool quit
    {
        get { return stop; }
        set { stop = value; }
    }

    /// <summary>
    /// Number of links whose page could not be downloaded.
    /// </summary>
    public int countnum
    {
        get { return count; }
        set { count = value; }
    }
    #endregion
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -