⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 webspider.cs

📁 一系列实验参考资料,希望对大家有点参考价值.是用JAVA编写的
💻 CS
字号:
using System;
using System.Collections;
using System.Text.RegularExpressions;
using System.IO;
using System.Net;
using System.Windows.Forms;

using Mf.Util;

namespace Mf.Service.WebSpider
{
   /// <summary>
   /// Summary description for Spider.
   /// </summary>
   public class WebSpider
   {
      private        Uri      	      m_startUri;
      private        Uri   		      m_baseUri;
      private        int	            m_uriProcessedCountMax;
      private        int			      m_uriProcessedCount;
      private        bool              m_keepWebContent;

      private        Queue    	      m_webPagesPending;
      private        Hashtable	      m_webPages;
      private        IWebPageProcessor m_webPageProcessor;

      public WebSpider( string startUri ) : 
         this ( startUri, -1 ) { }

      public WebSpider( string startUri, int uriProcessedCountMax ) : 
         this ( startUri, "", uriProcessedCountMax, false, new WebPageProcessor( ) ) { }

      public WebSpider( string startUri, string baseUri, int uriProcessedCountMax ) : 
         this ( startUri, baseUri, uriProcessedCountMax, false, new WebPageProcessor( ) ) { }

      public WebSpider( string startUri, string baseUri, int uriProcessedCountMax, bool keepWebContent, IWebPageProcessor webPageProcessor )
      {
         StartUri 	               = new Uri( startUri );
         
         // In future this could be null and will process cross-site, but for now must exist
         BaseUri  	               = new Uri( Is.EmptyString( baseUri ) ? m_startUri.GetLeftPart( UriPartial.Authority ) : baseUri );

         UriProcessedCountMax       = uriProcessedCountMax;
         KeepWebContent             = keepWebContent;

         m_webPagesPending          = new Queue( );
         m_webPages                 = new Hashtable( );

         m_webPageProcessor         = webPageProcessor;

         m_webPageProcessor.ContentHandler += new WebPageContentDelegate( this.HandleLinks );
      }

      #region public interface
      public void Execute( )
      {
         UriProcessedCount = 0;

         DateTime start = DateTime.Now;

         Console.WriteLine( "======================================================================================================" );
         Console.WriteLine( "Proccess URI: " + m_startUri.AbsoluteUri );
         Console.WriteLine( "Start At    : " + start );
         Console.WriteLine( "------------------------------------------------------------------------------------------------------" );

         AddWebPage( StartUri, StartUri.AbsoluteUri );

         try
         {
            while ( WebPagesPending.Count > 0 && 
               ( UriProcessedCountMax == -1 || UriProcessedCount < UriProcessedCountMax ) )
            {
               Console.WriteLine( "Max URI's: {0}, Processed URI's: {1}, Pending URI's: {2}", UriProcessedCountMax, UriProcessedCount, WebPagesPending.Count );

               WebPageState state = (WebPageState)m_webPagesPending.Dequeue( );

               m_webPageProcessor.Process( state );

               if ( ! KeepWebContent )
               {
                  state.Content = null;
               }

               UriProcessedCount++;
            }
         }
         catch (Exception ex)
         {
            Console.WriteLine( "Failure while running web spider: " + ex.ToString( ) );
         }

         DateTime end = DateTime.Now;
         float elasped = (end.Ticks - start.Ticks)/10000000;

         Console.WriteLine( "------------------------------------------------------------------------------------------------------" );
         Console.WriteLine( "URI Finished   : " + m_startUri.AbsoluteUri );
         Console.WriteLine( "Pages Processed: " + UriProcessedCount );
         Console.WriteLine( "Pages Pending  : " + WebPagesPending.Count );
         Console.WriteLine( "End At         : " + end );
         Console.WriteLine( "Elasped Time   : {0} seconds", elasped );
         Console.WriteLine( "======================================================================================================" );
      }

      public void HandleLinks( WebPageState state )
      {
         if ( state.ProcessInstructions.IndexOf( "Handle Links" ) != -1 )
         {
            int   counter  = 0;
            Match m        = RegExUtil.GetMatchRegEx( RegularExpression.UrlExtractor, state.Content );

            while( m.Success )
            {
               if ( AddWebPage( state.Uri, m.Groups["url"].ToString( ) ) )
               {
                  counter++;
               }

               m = m.NextMatch( );
            }

            Console.WriteLine( "           : {0} new links were added", counter );
         }
      }

      private bool AddWebPage( Uri baseUri, string newUri )
      {
         // Remove any anchors
         string   url      = StrUtil.LeftIndexOf( newUri, "#" );   

         // Construct a Uri, using the current page Uri as a base reference
         Uri      uri      = new Uri( baseUri, url );

         if ( ! ValidPage( uri.LocalPath ) || m_webPages.Contains( uri ) )
         {
            return false;
         }
         WebPageState state = new WebPageState( uri );

         // Only process links for pages within the same site. 
         if ( uri.AbsoluteUri.StartsWith( BaseUri.AbsoluteUri ) )
         {
            state.ProcessInstructions += "Handle Links";
         }

         m_webPagesPending.Enqueue  ( state );
         m_webPages.Add             ( uri, state );
         
         return true;
      }
      #endregion

      #region local interface
      private static readonly string[] m_validPageExtensions = new string[]
            {"html", "php", "asp", "htm", "jsp", "shtml", "php3", "aspx", "pl", "cfm" };

      // A page is considered valid if there is no extension or if the
      // last character is a /.  If there is an extension then the page
      // is considered valid if that extension is classed as valid.
      private bool ValidPage( string path ) 
      { 
         int   pos      = path.IndexOf( "." );

         if ( pos == -1 || path[ path.Length-1 ] == 47 ) /*.ToString( ).Equals( "/" )*/ 
         {
            return true;
         }

         string uriExt = StrUtil.RightOf( path, pos ).ToLower( );

         // Uri ends in an extension
         foreach ( string ext in m_validPageExtensions )
         {
            if ( uriExt.Equals( ext ) )
            {
               return true;
            }
         }

         return false;
      }
      #endregion

      #region properties
      public IWebPageProcessor WebPageProcessor
      {
         get
         {
            return m_webPageProcessor;
         }
         set
         {  
            m_webPageProcessor = value;
         }
      }
      public Uri StartUri
      {
         get
         {
            return m_startUri;
         }
         set
         {  
            m_startUri = value;
         }
      }
      public Uri BaseUri
      {
         get
         {
            return m_baseUri;
         }
         set
         {  
            m_baseUri = value;
         }
      }
      private int UriProcessedCount
      {
         get
         {
            return m_uriProcessedCount;
         }
         set
         {
            m_uriProcessedCount = value;
         }
      }
      public int UriProcessedCountMax
      {
         get
         {
            return m_uriProcessedCountMax;
         }
         set
         {  
            m_uriProcessedCountMax = value;
         }
      }
      public bool KeepWebContent
      {
         get
         {
            return m_keepWebContent;
         }
         set
         {  
            m_keepWebContent = value;
         }
      }
      public Hashtable WebPages
      {
         get
         {
            return m_webPages;
         }
         set
         {
            m_webPages = value;
         }
      }
      private Queue WebPagesPending
      {
         get
         {
            return m_webPagesPending;
         }
         set
         {
            m_webPagesPending = value;
         }
      }
      #endregion
   }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -