⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 parser.cs

📁 用C#编写的一款搜索引擎的源代码!摘自《Visual C# 2005 程序设计》
💻 CS
字号:
using System;
using System.IO;
using System.Text;
using System.Collections;

namespace MSNParser
{
	/// <summary>
	/// Screen-scraping parser for MSN Search result pages: builds the query URL,
	/// downloads the page through SinglePage.SinglePage.Download and extracts one
	/// Parser.Record (URL, title, snippet) per result entry found in the HTML.
	/// </summary>
	public class Parser:SinglePage.Parser
	{
		/// <summary>
		/// Construct an URL of MSN Search.
		/// </summary>
		/// <param name="words">Query words. They are placed into the query string exactly as given.</param>
		/// <param name="start">start rank (sent as the "first" query parameter)</param>
		/// <param name="maxResults">Not used when building the URL; kept so the signature mirrors GetHTML and Run.</param>
		/// <returns>URL</returns>
		private static String ConstructURL(String words,int start,int maxResults)
		{
			// NOTE(review): words is concatenated without URL-encoding, so raw text
			// containing '&', '#' or '+' would alter the query. Confirm whether callers
			// pre-encode before adding Uri.EscapeDataString here (to avoid double-encoding).
			return "http://search.msn.com/results.aspx?q="+words+"&first="+start.ToString();
		}

		/// <summary>
		/// Download the MSN Search result page for the given words and start rank.
		/// </summary>
		/// <param name="words">Query words</param>
		/// <param name="start">start rank</param>
		/// <param name="maxResults">max result length (passed through to ConstructURL, which ignores it)</param>
		/// <returns>The downloaded HTML, or NULL if SinglePage.SinglePage.Download reports failure.</returns>
		public static String GetHTML(String words,int start,int maxResults)
		{
			//Construct URL
			String url = Parser.ConstructURL(words,start,maxResults);
			//Download HTML code
			String html="";

			if(SinglePage.SinglePage.Download(url,ref html))
				return html;
			else
				return null;
		}

		/// <summary>
		/// Get search results of given words and range from MSN Search web page.
		/// </summary>
		/// <param name="words">Query words</param>
		/// <param name="start">start rank; values above 250 are rejected without downloading anything</param>
		/// <param name="maxResults">max result length</param>
		/// <returns>Array List MSNParser.Parser.Record[] of result records. Return NULL if there is no results or the download failed.</returns>
		public static ArrayList Run(String words,int start,int maxResults)
		{
			//Results rank must not exceed 250
			if(start>250)
				return null;
			//Download HTML code
			String html=Parser.GetHTML(words,start,maxResults);
			//A failed download yields null; report it as "no results" instead of crashing in the parser
			if(html==null)
				return null;
			//Parse HTML code
			return Parser.Run(html);
		}

		/// <summary>
		/// Parse MSN Search page, and get all result records in the page.
		/// </summary>
		/// <param name="HTML">HTML code to parse. NULL is treated as a page without results.</param>
		/// <returns>Array List MSNParser.Parser.Record[] of result records. Return NULL if there is no results.</returns>
		public static ArrayList Run(String HTML)
		{
			//Nothing to parse (e.g. GetHTML returned null after a failed download)
			if(HTML==null)
				return null;

			ArrayList ret = new ArrayList();

			//Get Result Section, if there is a section
			//<h2>Results</h2>
			//.......
			//<h2>SPONSORED SITES</h2>  (possible not exists)
			String remaining = HTML;
			int resultStart = remaining.IndexOf("<h2>Results</h2>",StringComparison.Ordinal);
			if(resultStart!=-1)
				remaining = remaining.Substring(resultStart);

			int resultEnd = remaining.IndexOf("<h2>SPONSORED SITES</h2>",StringComparison.Ordinal);
			if(resultEnd!=-1)
				remaining = remaining.Substring(0,resultEnd);

			//*************Parse MSN by its typical HTML code******
			//<H3><A href=":::URL:::">:::Title:::</A></H3>
			//  <P>:::Snippet:::</P>
			//  <UL>
			//    <LI class=first>.....URL Presentation......
			//    <LI><A 
			//    href="http://cc.msnscache.com/cache.aspx?q=3271652021366&amp;lang=en-US&amp;mkt=en-US&amp;FORM=CVRE">Cached 
			//    page</A> </LI></UL>
			//  <LI>
			//The markers below are matched exactly (ordinal, case-sensitive).

			String startFlag = "<h3><a href=\"";
			String endFlag   = "<li class=\"first\">";  

			while(true)
			{
				int start = remaining.IndexOf(startFlag,StringComparison.Ordinal);
				int end   = remaining.IndexOf(endFlag,StringComparison.Ordinal);
				//If not found, parsing finished.
				if(start==-1||end==-1)
					break;
				//If format error, parsing finished (records parsed so far are still returned).
				if(start>=end)
				{
					SinglePage.SinglePage.LogAppend("MSN Record Split Error:\r\n"+remaining);
					break;
				}
				//Parse record unit: everything from the start marker up to (not including) the end marker
				String recordHTML = remaining.Substring(start,end-start);
				//Drop the consumed record and its end marker so the next pass finds the following record
				remaining = remaining.Remove(start,end-start+endFlag.Length);
				Parser.Record r = ParseRecord(recordHTML);
				if(r!=null) 
					ret.Add(r);
			}
			if(ret.Count==0)
				return null;
			else
				return ret;
		}

		/// <summary>
		/// Extract URL, title and snippet from the HTML of a single result entry.
		/// </summary>
		/// <param name="recordHTML">HTML of one record, starting at its &lt;h3&gt;&lt;a href=" marker.</param>
		/// <returns>
		/// The parsed record, or NULL if the URL or title markers are missing.
		/// A missing snippet is not an error; the record then carries an empty snippet.
		/// Unexpected exceptions are logged through SinglePage.SinglePage.LogAppend and also yield NULL.
		/// </returns>
		private static Parser.Record ParseRecord(String recordHTML)
		{
			//*************Parse MSN by its typical HTML code******
			//<H3><A href=":::URL:::">:::Title:::</A></H3>
			//  <P>:::Snippet:::</P>
			//  <UL>
			//    <LI class=first>.....URL Presentation......
			//    <LI><A 
			//    href="http://cc.msnscache.com/cache.aspx?q=3271652021366&amp;lang=en-US&amp;mkt=en-US&amp;FORM=CVRE">Cached 
			//    page</A> </LI></UL>
			//  <LI>
			String URL,title,snippet="";
			int start, end;
			try
			{
				//Parse URL: the text between href=" and the next double quote
				String hrefFlag = "href=\"";
				start = recordHTML.IndexOf(hrefFlag,StringComparison.Ordinal);
				if(start==-1)
					return null;
				start += hrefFlag.Length;

				end = recordHTML.IndexOf("\"",start,StringComparison.Ordinal);
				if(end==-1) 
					return null;
				URL = recordHTML.Substring(start,end-start);

				//Parse Title: the text between the end of the opening <a ...> tag and </a></h3>.
				//Test each index for -1 BEFORE adjusting it, and search forward from the
				//previous match so the end marker can never precede the start marker.
				String tagCloseFlag = "\">";
				start = recordHTML.IndexOf(tagCloseFlag,end,StringComparison.Ordinal);
				if(start==-1)
					return null;
				start += tagCloseFlag.Length;

				end = recordHTML.IndexOf("</a></h3>",start,StringComparison.Ordinal);
				if(end==-1) 
					return null;
				title = recordHTML.Substring(start,end-start).Replace("<strong>","").Replace("</strong>","");

				//Parse Snippet (optional): first <p>...</p> pair after the title
				String snippetOpenFlag = "<p>";
				start = recordHTML.IndexOf(snippetOpenFlag,end,StringComparison.Ordinal);
				if(start!=-1)
				{
					start += snippetOpenFlag.Length;
					end = recordHTML.IndexOf("</p>",start,StringComparison.Ordinal);
					if(end!=-1)  //if there is snippet
					{
						snippet = recordHTML.Substring(start,end-start);
						snippet = snippet.Replace("<strong>","").Replace("</strong>","");
					}
				}
			
				return new Parser.Record(URL,title,snippet);
			}
			catch(Exception ex)
			{
				//Best-effort parsing: a malformed record is logged and skipped, never fatal
				SinglePage.SinglePage.LogAppend("MSN Record Parser Error:\r\n"+recordHTML+"\n[Detail:] "+ex.Message);
				return null;
			}
		}

	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -