⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 parser.cs

📁 用C#编写的一款搜索引擎的源代码!摘自《Visual C# 2005 程序设计》
💻 CS
字号:
using System;
using System.Collections;
using System.Text;
using System.IO;

namespace GoogleParser
{
	/// <summary>
	/// Downloads Google Search result pages and extracts the result records
	/// (URL, title, snippet) from their HTML by locating fixed marker strings.
	/// </summary>
	public class Parser:SinglePage.Parser
	{
		//*************Parse Google by its typical HTML code******
		//<p class=g>
		//<a class=l href="URL" onmousedown="return clk(this.href,'','','res','2','')">Title</a>
		//<table cellpadding=0 cellspacing=0 border=0 id="table1">
		//<tr><td class=j><font size=-1>Snippet<br>
		//<font color=#008000>
		//..........
		//Similar&nbsp;pages</a>

		// Markers delimiting one record inside the page.
		private const String RecordStartFlag = "<p class=g>";
		private const String RecordEndFlag   = "<font color=#008000>";   //some records only have one of the end flags.
		private const String RecordEndFlag2  = "Similar&nbsp;pages</a>";

		// Markers delimiting the fields inside one record.
		private const String UrlStartFlag     = "href=\"";
		private const String UrlEndFlag       = "\" onmousedown";
		private const String TitleStartFlag   = "'')\">";
		private const String TitleEndFlag     = "</a>";
		private const String SnippetStartFlag = "<font size=-1>";

		/// <summary>
		/// Construct an URL of Google Search.
		/// </summary>
		/// <param name="words">Query words (raw text; it is percent-escaped here). Null is treated as empty.</param>
		/// <param name="start">start rank</param>
		/// <param name="maxResults">max result length</param>
		/// <returns>URL</returns>
		private static String ConstructURL(String words,int start,int maxResults)
		{
			// Escape the query so that spaces, '&', '#', '+' etc. cannot break or
			// truncate the query string. Null keeps its former meaning (empty query).
			String query = (words==null) ? "" : Uri.EscapeDataString(words);
			return "http://www.google.com/search?q="+query+"&num="+maxResults.ToString()+"&hl=en&lr=&start="+start.ToString();
		}

		/// <summary>
		/// Get HTML code of Google Search (Given words,start,max result count).
		/// </summary>
		/// <param name="words">Query words</param>
		/// <param name="start">start rank</param>
		/// <param name="maxResults">max result count</param>
		/// <returns>HTML code string, null if the download reported failure.</returns>
		public static String GetHTML(String words,int start,int maxResults)
		{
			//Construct URL
			String url = Parser.ConstructURL(words,start,maxResults);
			//Download HTML code
			String html="";

			if(SinglePage.SinglePage.Download(url,ref html))
			{
				return html;
			}
			return null;
		}

		/// <summary>
		/// Get search results of given words and range from Google Search web page.
		/// </summary>
		/// <param name="words">Query words</param>
		/// <param name="start">start rank</param>
		/// <param name="maxResults">max result length</param>
		/// <returns>ArrayList of Parser.Record result records. Returns null if there are no results.</returns>
		/// <exception cref="System.Net.WebException">The result page could not be downloaded.</exception>
		public static ArrayList Run(String words,int start,int maxResults)
		{
			//Download HTML code
			String html=Parser.GetHTML(words,start,maxResults);
			if(html==null)
			{
				throw new System.Net.WebException("Google Unable to process request at this time.");
			}
			//Parse HTML code
			return Parser.Run(html);
		}

		/// <summary>
		/// Parse Google Search page, and get all result records in the page.
		/// </summary>
		/// <param name="HTML">HTML code to parse. Null is treated as a page without records.</param>
		/// <returns>ArrayList of Parser.Record result records. Returns null if there are no results.</returns>
		public static ArrayList Run(String HTML)
		{
			if(HTML==null)
			{
				return null;
			}

			ArrayList ret = new ArrayList();
			// Scan forward with a cursor instead of repeatedly copying and cutting the
			// page: each record is located after the end of the previous one.
			int pos = 0;
			while(true)
			{
				int start = HTML.IndexOf(RecordStartFlag,pos,StringComparison.Ordinal);
				//If not found, parsing finished.
				if(start==-1)
				{
					break;
				}

				// The record ends at whichever end flag comes first after its start.
				int end       = HTML.IndexOf(RecordEndFlag,start,StringComparison.Ordinal);
				int endLength = RecordEndFlag.Length;
				int end2      = HTML.IndexOf(RecordEndFlag2,start,StringComparison.Ordinal);
				if(end==-1||(end2!=-1&&end>end2))
				{
					end       = end2;
					endLength = RecordEndFlag2.Length;
				}
				//If not found, parsing finished.
				if(end==-1)
				{
					break;
				}

				//Parse record unit
				Parser.Record r = ParseRecord(HTML.Substring(start,end-start));
				if(r!=null)
				{
					ret.Add(r);
				}

				// Skip past the end flag that actually matched.
				pos = end+endLength;
			}

			if(ret.Count==0)
			{
				return null;
			}
			return ret;
		}

		/// <summary>
		/// Parse the HTML of a single result record into its URL, title and snippet.
		/// </summary>
		/// <param name="recordHTML">HTML from the record start flag up to (not including) its end flag.</param>
		/// <returns>The parsed record, or null if any of the expected markers is missing.</returns>
		private static Parser.Record ParseRecord(String recordHTML)
		{
			try
			{
				//Parse URL
				String URL = TextBetween(recordHTML,UrlStartFlag,UrlEndFlag);
				if(URL==null)
				{
					return null;
				}

				//Parse Title
				String title = TextBetween(recordHTML,TitleStartFlag,TitleEndFlag);
				if(title==null)
				{
					return null;
				}
				title = StripBold(title);

				//Parse Snippet: from the font tag up to the last <br> (or, failing that, the last <nobr>).
				int start = recordHTML.IndexOf(SnippetStartFlag,StringComparison.Ordinal);
				int end = recordHTML.LastIndexOf("<br>",StringComparison.Ordinal);
				if(end<0)
				{
					end = recordHTML.LastIndexOf("<nobr>",StringComparison.Ordinal);
				}
				if(start==-1||end==-1)
				{
					return null;
				}
				int snippetStart = start+SnippetStartFlag.Length;
				// The closing marker lies before the snippet text: nothing to extract.
				if(end<snippetStart)
				{
					return null;
				}
				String snippet = StripBold(recordHTML.Substring(snippetStart,end-snippetStart));

				return new Parser.Record(URL,title,snippet);
			}
			catch(Exception ex)
			{
				// Safety net for anything unexpected; a failed record must not abort the whole page.
				SinglePage.SinglePage.LogAppend("Google Record Parser Error:\r\n"+recordHTML+"\n[Detail:] "+ex.Message);
				return null;
			}
		}

		/// <summary>
		/// Return the text located between the first occurrence of a start marker and
		/// the first occurrence of an end marker that follows it.
		/// </summary>
		/// <param name="text">Text to search.</param>
		/// <param name="startFlag">Marker preceding the wanted text.</param>
		/// <param name="endFlag">Marker following the wanted text.</param>
		/// <returns>The enclosed text (possibly empty), or null if either marker is missing.</returns>
		private static String TextBetween(String text,String startFlag,String endFlag)
		{
			// Test the raw index for -1 BEFORE applying any offset to it.
			int flagIndex = text.IndexOf(startFlag,StringComparison.Ordinal);
			if(flagIndex==-1)
			{
				return null;
			}
			int contentStart = flagIndex+startFlag.Length;
			int contentEnd = text.IndexOf(endFlag,contentStart,StringComparison.Ordinal);
			if(contentEnd==-1)
			{
				return null;
			}
			return text.Substring(contentStart,contentEnd-contentStart);
		}

		/// <summary>
		/// Remove the bold tags Google places around matched query words.
		/// </summary>
		/// <param name="text">Text possibly containing bold tags.</param>
		/// <returns>The text without the opening and closing bold tags.</returns>
		private static String StripBold(String text)
		{
			return text.Replace("<b>","").Replace("</b>","");
		}

	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -