// spider.cs
using System;
using System.Collections;
using System.Net;
using System.IO;
using System.Threading;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using ShootSearch.Logging;
namespace ShootSearch.Spiders.http
{
/// <summary>
/// The main class for the spider. This spider can be used with the
/// SpiderForm form that has been provided. The spider is completely
/// self-contained. If you would like to use the spider with your own
/// application, just remove the references to m_spiderForm from this file.
///
/// The files needed for the spider are:
///
/// Attribute.cs - Used by the HTML parser
/// AttributeList.cs - Used by the HTML parser
/// DocumentWorker - Used to "thread" the spider
/// Done.cs - Allows the spider to know when it is done
/// Parse.cs - Used by the HTML parser
/// ParseHTML.cs - The HTML parser
/// Spider.cs - This file
/// SpiderForm.cs - Demo of how to use the spider
///
/// This spider is copyright 2003 by Jeff Heaton. However, it is
/// released under a Limited GNU Public License (LGPL). You may
/// use it freely in your own programs. For the latest version visit
/// http://www.jeffheaton.com.
///
/// </summary>
public class Spider : SpiderMan
{
    /// <summary>
    /// The URL's that have already been queued, keyed by the trimmed
    /// string form of the URI. The value is a <see cref="Status"/>.
    /// </summary>
    private Hashtable m_already;

    /// <summary>
    /// URL's that are waiting to be processed.
    /// </summary>
    private Queue m_workload;

    /// <summary>
    /// The first URL to spider. All other URL's must have the
    /// same hostname as this URL.
    /// </summary>
    private Uri m_base;

    /// <summary>
    /// The directory to save the spider output to.
    /// </summary>
    private string m_outputPath;

    /// <summary>
    /// How many URL's have been handed out to workers so far.
    /// </summary>
    private int m_urlCount = 0;

    /// <summary>
    /// When the spider started working, in UTC ticks.
    /// </summary>
    private long m_startTime = 0;

    /// <summary>
    /// Used to keep track of when the spider might be done.
    /// </summary>
    private readonly Done m_done = new Done();

    /// <summary>
    /// Used to tell the spider to quit. Marked volatile because it is
    /// written by the thread calling <see cref="StopWork"/> and read
    /// through <see cref="Quit"/> by other threads.
    /// </summary>
    private volatile bool m_quit;

    /// <summary>
    /// The status for each URL that was processed.
    /// </summary>
    private enum Status { STATUS_FAILED, STATUS_SUCCESS, STATUS_QUEUED }

    /// <summary>
    /// The path for Index files.
    /// </summary>
    private string m_indexpath;

    /// <summary>
    /// The <c>DocumentWorker</c> instances created by the last call to
    /// <see cref="Start"/>; null until <see cref="Start"/> has been called.
    /// </summary>
    private ArrayList arrThreads;

    /// <summary>
    /// The constructor. Creates an empty spider ready for <see cref="Start"/>.
    /// </summary>
    public Spider()
    {
        reset();
    }

    /// <summary>
    /// Call to reset from a previous run of the spider. Discards the set of
    /// known URL's and the pending workload, clears the quit flag and
    /// restarts the processed-URL counter.
    /// </summary>
    public void reset()
    {
        m_already = new Hashtable();
        m_workload = new Queue();
        // The counter feeds the urls/sec figure, which is measured from the
        // start time of the current run, so it must restart with the run.
        m_urlCount = 0;
        m_quit = false;
    }

    /// <summary>
    /// Add the specified URL to the list of URI's to spider.
    /// This is usually only used by the spider, itself, as
    /// new URL's are found. A URI that was already added is ignored.
    /// </summary>
    /// <param name="uri">The URI to add</param>
    /// <exception cref="ArgumentNullException"><paramref name="uri"/> is null.</exception>
    public void addURI(Uri uri)
    {
        if (uri == null)
        {
            throw new ArgumentNullException("uri");
        }

        // Use one key for both the lookup and the insert so the
        // duplicate check can never miss an entry it stored itself.
        string key = uri.ToString().Trim();

        // lock releases the monitor even if an exception is thrown.
        // The monitor object stays 'this' because code outside this file
        // may synchronize on the spider instance.
        lock (this)
        {
            if (!m_already.Contains(key))
            {
                m_already.Add(key, Status.STATUS_QUEUED);
                m_workload.Enqueue(uri);
            }
            // Wake one worker that may be blocked in ObtainWork.
            Monitor.Pulse(this);
        }
    }

    /// <summary>
    /// The URI that is to be spidered
    /// </summary>
    public Uri BaseURI
    {
        get { return m_base; }
        set { m_base = value; }
    }

    /// <summary>
    /// The local directory to save the spidered files to
    /// </summary>
    public string OutputPath
    {
        get { return m_outputPath; }
        set { m_outputPath = value; }
    }

    /// <summary>
    /// Set to true to request the spider to quit.
    /// </summary>
    public bool Quit
    {
        get { return m_quit; }
        set { m_quit = value; }
    }

    /// <summary>
    /// Used to determine if the spider is done,
    /// this object is usually only used internally
    /// by the spider.
    /// </summary>
    public Done SpiderDone
    {
        get { return m_done; }
    }

    /// <summary>
    /// The path for Index files.
    /// </summary>
    public string IndexPath
    {
        get { return m_indexpath; }
        set { m_indexpath = value; }
    }

    /// <summary>
    /// Called by the worker threads to obtain a URL to process. Blocks
    /// until a URL is available, then reports progress to the log console.
    /// </summary>
    /// <returns>The next URL to process.</returns>
    public Uri ObtainWork()
    {
        lock (this)
        {
            // NOTE(review): this wait does not observe the quit flag, so a
            // worker blocked here is only released by a new URL being added.
            // Presumably DocumentWorker.StopWork() terminates such workers;
            // confirm before changing the contract (e.g. returning null).
            while (m_workload.Count < 1)
            {
                Monitor.Wait(this);
            }

            Uri next = (Uri)m_workload.Dequeue();
            Log.ProcessURL(next.ToString());
            Log.m_Console.SetProcessedCount("" + (m_urlCount++));

            // UTC ticks are immune to daylight-saving and time-zone shifts.
            long etime = (DateTime.UtcNow.Ticks - m_startTime) / TimeSpan.TicksPerSecond;
            // Avoid dividing by zero during the first second of the run.
            long urls = (etime == 0) ? 0 : m_urlCount / etime;
            Log.m_Console.SetElapsedTime(etime / 60 + " minutes (" + urls + " urls/sec)");

            return next;
        }
    }

    /// <summary>
    /// Start the spider. Queues the base URI, starts the worker threads and
    /// then blocks the calling thread on the <see cref="SpiderDone"/> object
    /// (<c>WaitBegin</c> followed by <c>WaitDone</c>).
    /// </summary>
    /// <param name="baseURI">The base URI to spider</param>
    /// <param name="threads">The number of threads to use; must be at least 1</param>
    /// <exception cref="ArgumentNullException"><paramref name="baseURI"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="threads"/> is less than 1.</exception>
    public void Start(Uri baseURI, int threads)
    {
        if (baseURI == null)
        {
            throw new ArgumentNullException("baseURI");
        }
        if (threads < 1)
        {
            // With no workers nothing would ever consume the queue and the
            // waits below would presumably never return.
            throw new ArgumentOutOfRangeException("threads", threads,
                "At least one worker thread is required.");
        }

        // init the spider
        m_quit = false;
        m_base = baseURI;
        addURI(m_base);
        m_startTime = DateTime.UtcNow.Ticks;
        m_done.Reset();
        arrThreads = new ArrayList(threads);

        // startup the threads; workers are numbered 1..threads so that
        // exactly the requested number of workers is created.
        for (int i = 1; i <= threads; i++)
        {
            DocumentWorker worker = new DocumentWorker(this);
            worker.Number = i;
            arrThreads.Add(worker);
            worker.start();
        }

        // now wait to be done
        m_done.WaitBegin();
        m_done.WaitDone();
    }

    /// <summary>
    /// Sets the output path from the configured cache directory and starts
    /// spidering the given URL. Exceptions (for example from parsing
    /// <paramref name="p_baseUrl"/>) propagate to the caller.
    /// </summary>
    /// <param name="p_baseUrl">The base URL to spider, as a string</param>
    /// <param name="threads">The number of threads to use</param>
    public void AcceptJob(string p_baseUrl, int threads)
    {
        OutputPath = ShootSearch.Helper.Config.CacheDirectory;
        Start(new Uri(p_baseUrl), threads);
    }

    /// <summary>
    /// Requests the spider to quit and asks every worker created by the
    /// last <see cref="Start"/> call to stop. Safe to call before
    /// <see cref="Start"/> has ever been called.
    /// </summary>
    public void StopWork()
    {
        this.Quit = true;

        // arrThreads is only created by Start; nothing to stop before that.
        ArrayList workers = arrThreads;
        if (workers == null)
        {
            return;
        }

        for (int i = 0; i < workers.Count; i++)
        {
            ((DocumentWorker)workers[i]).StopWork();
        }
    }

    /// <summary>
    /// Sets the path for Index files; equivalent to assigning
    /// <see cref="IndexPath"/>.
    /// </summary>
    /// <param name="p_strPath">The index path</param>
    public void setIndexPath(string p_strPath)
    {
        this.IndexPath = p_strPath;
    }

    /// <summary>
    /// Reports whether the spider has been asked to quit.
    /// </summary>
    /// <returns>The current value of <see cref="Quit"/>.</returns>
    public bool IsStoped()
    {
        return this.Quit;
    }
}
}
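// Keyboard shortcut help (residue from the code-viewer page; not part of the program):
//   Copy code:           Ctrl + C
//   Search code:         Ctrl + F
//   Full-screen mode:    F11
//   Toggle theme:        Ctrl + Shift + D
//   Show shortcuts:      ?
//   Increase font size:  Ctrl + =
//   Decrease font size:  Ctrl + -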