⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 webspider.cs

📁 一系列实验参考资料,希望对大家有点参考价值.是用JAVA编写的
💻 CS
字号:
using System;
using System.Collections;
using System.Net;
using System.Threading;
using System.Windows.Forms;
using Mf.Util;
//using Mf._Test.Util;

namespace Mf.Service.WebSpider
{
   /// <summary>
   /// Crawls a web site starting from <see cref="StartUri"/>. Pending pages are
   /// handed to a <c>WebPageProcessor</c> on thread-pool threads, and every
   /// accepted Uri is recorded in <see cref="WebPages"/>.
   /// </summary>
   /// <remarks>
   /// NOTE(review): <see cref="Execute"/> dequeues on the calling thread while
   /// <see cref="AddWebPage"/> is public and is presumably invoked by the
   /// processor from pool threads. The queue and hashtable are not synchronized
   /// here - confirm against WebPageProcessor before relying on thread safety.
   /// </remarks>
   public class WebSpider
   {
      /// <summary>Upper bound accepted by <see cref="ThreadCountMax"/>.</summary>
      public  const  int               MAX_THREADS = 20;

      // Work items currently counted as in flight, and the cap on that number.
      private        int               m_threadCount;
      private        int               m_threadCountMax;

      // Crawl configuration.
      private        Uri               m_startUri;
      private        Uri               m_baseUri;
      private        int               m_uriProcessedCountMax;
      private        int               m_uriProcessedCount;
      private        bool              m_saveWebContent;

      // Crawl state: pages waiting to be dispatched, and every page seen so far
      // (keyed by Uri, value is the page's WebPageState).
      private        Queue             m_webPagesPending;
      private        Hashtable         m_webPages;
      private        WebPageProcessor  m_processor;
      private        ManualResetEvent  m_resetEvent;

      /// <summary>
      /// Creates a spider with a thread limit of 10 that does not save web content.
      /// </summary>
      /// <param name="startUri">Absolute Uri of the first page to crawl.</param>
      /// <param name="baseUri">Prefix a page's Uri must start with to be crawled; see the five-argument constructor.</param>
      /// <param name="uriProcessedCountMax">Maximum number of pages to dispatch for processing.</param>
      public WebSpider( string startUri, string baseUri, int uriProcessedCountMax )
         : this ( startUri, baseUri, uriProcessedCountMax, 10, false )
      {
      }

      /// <summary>
      /// Creates a spider with explicit threading and content-saving settings.
      /// </summary>
      /// <param name="startUri">Absolute Uri of the first page to crawl.</param>
      /// <param name="baseUri">
      /// Prefix a page's Uri must start with to be crawled. When <c>Is.EmptyString</c>
      /// reports it as empty, the scheme and authority of <paramref name="startUri"/> are used.
      /// </param>
      /// <param name="uriProcessedCountMax">Maximum number of pages to dispatch for processing.</param>
      /// <param name="threadCountMax">Requested concurrency; clamped to 1..<see cref="MAX_THREADS"/>.</param>
      /// <param name="saveWebContent">Stored in <see cref="SaveWebContent"/>; not read by this class.</param>
      public WebSpider( string startUri, string baseUri, int uriProcessedCountMax, int threadCountMax, bool saveWebContent )
      {
         // StartUri must be assigned first: the BaseUri fallback reads m_startUri.
         StartUri                   = new Uri( startUri );
         BaseUri                    = new Uri( Is.EmptyString( baseUri ) ? m_startUri.GetLeftPart( UriPartial.Authority ) : baseUri );

         UriProcessedCountMax       = uriProcessedCountMax;
         SaveWebContent             = saveWebContent;

         ThreadCountMax             = threadCountMax;

         // This should move down to execute, will need to wait for interface
         ThreadCount                = 0;
         m_processor                = new WebPageProcessor( this );

         m_webPagesPending          = new Queue( );
         m_webPages                 = new Hashtable( );
      }

      #region public interface
      /// <summary>
      /// Runs the crawl: seeds the queue with the start Uri, then repeatedly
      /// dispatches pending pages to the thread pool until the queue is empty
      /// or <see cref="UriProcessedCountMax"/> pages have been dispatched.
      /// Start/end times, elapsed seconds and page counts are written to the log.
      /// </summary>
      public void Execute( )
      {
         m_uriProcessedCount = 0;

         DateTime start = DateTime.Now;

         Log( "Start At: " + start );

         AddWebPage( m_startUri, m_startUri.AbsoluteUri );

         try
         {
            while ( m_webPagesPending.Count > 0 && m_uriProcessedCount < m_uriProcessedCountMax )
            {
               // NOTE(review): a fresh event is created per batch and the previous one
               // is never closed. It is presumably signalled by WebPageProcessor when a
               // page completes - confirm before adding Close() here, since a worker
               // may still hold a reference to the old event.
               ResetEvent = new ManualResetEvent( false );

               // Dispatch as many pages as the thread limit allows. The processed-count
               // limit is re-checked here so that a single batch cannot overshoot
               // UriProcessedCountMax.
               while ( m_threadCount < m_threadCountMax
                       && m_webPagesPending.Count > 0
                       && m_uriProcessedCount < m_uriProcessedCountMax )
               {
                  Log( "Inner-MainLoop - Thread Count: {0}, PendingPages: {1}, MAX: {2}", m_threadCount, m_webPagesPending.Count, m_threadCountMax );

                  ThreadPool.QueueUserWorkItem(
                     new WaitCallback( m_processor.Process ), m_webPagesPending.Dequeue( ) );

                  m_uriProcessedCount++;
                  m_threadCount++;
               }

               // Block until the event is signalled before evaluating the queue again.
               ResetEvent.WaitOne( Timeout.Infinite, true );
            }
         }
         catch ( NotSupportedException )
         {
            Log( "This class will fail on systems with limited thread support" );
         }

         DateTime end      = DateTime.Now;

         // TotalSeconds keeps the fractional part; dividing the tick difference by an
         // integer constant truncated the result to whole seconds.
         TimeSpan duration = end - start;
         float    elapsed  = (float) duration.TotalSeconds;

         Log( "End At: " + end );
         Log( "Elasped Time: {0} seconds", elapsed );

         Log( "Pages Processed: "   + UriProcessedCount );
         Log( "Pages Pending: "     + m_webPagesPending.Count );
      }

      /// <summary>
      /// Registers a link found on a page. The link is resolved against
      /// <paramref name="baseUri"/>, ignored if its extension is not accepted or
      /// it is already known, and otherwise recorded in <see cref="WebPages"/>.
      /// It is queued for processing only if it starts with <see cref="BaseUri"/>.
      /// </summary>
      /// <param name="baseUri">Uri of the page the link was found on.</param>
      /// <param name="newUri">The link text, absolute or relative.</param>
      public void AddWebPage( Uri baseUri, string newUri )
      {
         // Remove any anchors
         string url = StrUtil.LeftIndexOf( newUri, "#" );

         // Construct a Uri, using the current page Uri as a base reference
         Uri    uri = new Uri( baseUri, url );

         if ( ! ValidPageExtension( uri.LocalPath ) )
         {
            return;
         }

         if ( m_webPages.Contains( uri ) )
         {
            // Already known; nothing to do.
            return;
         }

         WebPageState state = new WebPageState( uri );

         // Only process pages that are within the same location as this site;
         // off-site pages are still recorded below but never queued.
         if ( uri.AbsoluteUri.StartsWith( BaseUri.AbsoluteUri ) )
         {
            m_webPagesPending.Enqueue( state );
         }

         m_webPages.Add( uri, state );
      }
      #endregion

      #region local interface

      // Extensions (without the dot, lower case) that are treated as crawlable pages.
      private static readonly string[] m_validPageExtensions = new string[]
            {"html", "php", "asp", "htm", "jsp", "shtml", "php3", "aspx", "pl", "cfm" };

      /// <summary>
      /// Decides whether a Uri path looks like a crawlable page.
      /// </summary>
      /// <param name="path">Local path of the Uri.</param>
      /// <returns>
      /// <c>true</c> when the last path segment has no extension (this includes
      /// paths ending in '/') or its extension is in the accepted list;
      /// otherwise <c>false</c>.
      /// </returns>
      private bool ValidPageExtension( string path )
      {
         int lastSlash = path.LastIndexOf( '/' );
         int lastDot   = path.LastIndexOf( '.' );

         // The extension is whatever follows the last dot of the LAST segment.
         // A dot that sits before the final '/' belongs to a directory name
         // (e.g. "/v1.0/docs/"), so the segment itself has no extension.
         if ( lastDot == -1 || lastDot < lastSlash )
         {
            return true;
         }

         string uriExt = StrUtil.RightOf( path, lastDot ).ToLower( );

         foreach ( string ext in m_validPageExtensions )
         {
            if ( uriExt.Equals( ext ) )
            {
               return true;
            }
         }

         return false;
      }

      #endregion

      #region Log
      // Thin forwarding overloads onto the project's _.P output helper.
      private void Log( string v )
      {
         _.P( v );
      }
      private void Log( string format, object o1 )
      {
         _.P( format, o1 );
      }
      private void Log( string format, object o1, object o2 )
      {
         _.P( format, o1, o2 );
      }
      private void Log( string format, object o1, object o2, object o3 )
      {
         _.P( format, o1, o2, o3 );
      }
      #endregion

      #region properties
      /// <summary>
      /// Number of work items counted as in flight. Execute increments it for each
      /// queued page; nothing in this class decrements it.
      /// NOTE(review): presumably the processor lowers it through this setter - confirm.
      /// </summary>
      public int ThreadCount
      {
         get
         {
            return m_threadCount;
         }
         set
         {
            m_threadCount = value;
         }
      }

      /// <summary>
      /// Maximum number of in-flight work items. Values assigned are clamped to
      /// the range 1..<see cref="MAX_THREADS"/>.
      /// </summary>
      public int ThreadCountMax
      {
         get
         {
            return m_threadCountMax;
         }
         set
         {  // Must be between 1 && MAX_THREADS
            m_threadCountMax = ( value<1 ? 1 : ( value>MAX_THREADS ? MAX_THREADS : value ) );
         }
      }

      /// <summary>Uri of the first page that Execute queues.</summary>
      public Uri StartUri
      {
         get
         {
            return m_startUri;
         }
         set
         {
            m_startUri = value;
         }
      }

      /// <summary>Prefix that a page's absolute Uri must start with to be queued for processing.</summary>
      public Uri BaseUri
      {
         get
         {
            return m_baseUri;
         }
         set
         {
            m_baseUri = value;
         }
      }

      /// <summary>Number of pages dispatched by the current or most recent Execute call.</summary>
      public int UriProcessedCount
      {
         get
         {
            return m_uriProcessedCount;
         }
      }

      /// <summary>Maximum number of pages Execute will dispatch.</summary>
      public int UriProcessedCountMax
      {
         get
         {
            return m_uriProcessedCountMax;
         }
         set
         {
            m_uriProcessedCountMax = value;
         }
      }

      /// <summary>
      /// Flag supplied at construction; this class only stores it.
      /// NOTE(review): presumably read by the processor - confirm.
      /// </summary>
      public bool SaveWebContent
      {
         get
         {
            return m_saveWebContent;
         }
         set
         {
            m_saveWebContent = value;
         }
      }

      /// <summary>All pages accepted so far, keyed by Uri with a WebPageState value.</summary>
      public Hashtable WebPages
      {
         get
         {
            return m_webPages;
         }
      }

      /// <summary>
      /// Event that Execute waits on after dispatching a batch; replaced with a
      /// new unsignalled instance on every pass of Execute's main loop.
      /// </summary>
      internal ManualResetEvent ResetEvent
      {
         get
         {
            return m_resetEvent;
         }
         set
         {
            m_resetEvent = value;
         }
      }
      #endregion
   }

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -