📄 parser.cs

📁 用C#编写的一款搜索引擎的源代码！摘自《Visual C# 2005 程序设计》
💻 CS
字号:
using System;
using System.IO;
using System.Text;
using System.Collections;

namespace YahooParser
{
	/// <summary>
	/// Screen-scraping parser for Yahoo Search result pages. It builds the search URL,
	/// downloads the page through SinglePage.SinglePage.Download and extracts one
	/// Parser.Record (URL, title, snippet) per result found in the HTML.
	/// </summary>
	public class Parser:SinglePage.Parser
	{
		//*************Typical Yahoo result HTML the parser is written against******
		//<li><div><a class=yschttl  
		//href="http://rds.yahoo.com/_ylt=A0geu..........0335/**http%3a//www.google.com/"><b>
		//:::Title:::</b></a> 
		//<a href="http://rds.yahoo.com/_ylt=A0geu.........0335/**http%3a//www.google.com/" target=_blank>
		//	<img src="nw2.gif" height=11 width=11 border=0 alt="Open this result in new window"></a>
		// 	(Nasdaq: <b><a href="http://rds.yahoo.com/_ylt=A0geu.......0335/**http%3a//finance.yahoo.com/q%3fs=GOOG%26d=t" class=yschls>
		//	GOOG</a></b>)
		//</div>
		//	  <div class=yschabstr>:::Snippet:::</div>
		//	<div> Category: <a href="http://rds.yahoo.com/_ylt=A0ge.......nd_World_Wide_Web/Search_and_Navigation/">
		//	Web Search and Navigation Services</a></div>
		//
		//<em class=yschurl>www.<b>google</b>.com</em>
		// - <a href="http://rds.yahoo.com/_ylt=A0ge.........0%26vs=www.google.com">
		//More from this site</a>
		//<span class=yschprs><em> - 
		//<a href="http://us.rd.yahoo.com/evt=383...........return false" >
		//Save</a></em></span>

		//Marker that opens one result record in the page.
		private const String RecordStartFlag = "<li><div><a class=yschttl";
		//Marker that closes one result record (the span holding the "Save" link).
		private const String RecordEndFlag = "<span class=yschprs><em>";
		//Prefix of the real target URL inside Yahoo's redirect link.
		private const String RedirectUrlFlag = "**http%3a";
		//End of the href attribute / opening anchor tag; the title text follows it.
		private const String TagCloseFlag = "\">";
		//End of the title anchor.
		private const String AnchorEndFlag = "</a>";
		//Opening tag of the snippet (abstract) block.
		private const String SnippetStartFlag = "<div class=yschabstr>";
		//Closing tag of the snippet block.
		private const String DivEndFlag = "</div>";

		/// <summary>
		/// Construct an URL for Yahoo Search.
		/// </summary>
		/// <param name="words">Query words, inserted into the query string as given.</param>
		/// <param name="start">start rank (sent as the "b" parameter)</param>
		/// <param name="maxResults">max result count (sent as the "n" parameter)</param>
		/// <returns>URL to search</returns>
		private static String ConstructURL(String words,int start,int maxResults)
		{
			//NOTE(review): words is concatenated without URL encoding, so characters such as
			//'&', '#' or '+' change the meaning of the query string. Left unchanged because the
			//callers are not visible here and may already pass encoded text; if they pass raw
			//user input, wrap words in Uri.EscapeDataString - confirm before changing.
			System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;
			//Format the numbers culture-independently: they are part of a machine-readable URL.
			return "http://search.yahoo.com/search?ei=UTF-8&p="+words
				+"&b="+start.ToString(invariant)
				+"&n="+maxResults.ToString(invariant);
		}

		/// <summary>
		/// Get HTML code of Yahoo Search (Given words,start,max result count).
		/// </summary>
		/// <param name="words">Query words</param>
		/// <param name="start">start rank</param>
		/// <param name="maxResults">max result count</param>
		/// <returns>HTML code string, or null if SinglePage.SinglePage.Download reports failure.</returns>
		public static String GetHTML(String words,int start,int maxResults)
		{
			//Construct URL
			String url = ConstructURL(words,start,maxResults);
			//Download HTML code; Download fills html through the ref parameter.
			String html="";
			if(SinglePage.SinglePage.Download(url,ref html))
			{
				return html;
			}
			return null;
		}

		/// <summary>
		/// Get search results of given words and range from Yahoo Search web page.
		/// </summary>
		/// <param name="words">Query words</param>
		/// <param name="start">start rank</param>
		/// <param name="maxResults">max result length</param>
		/// <returns>ArrayList of Parser.Record, one per parsed result. Returns null if there are no results.</returns>
		/// <exception cref="System.Net.WebException">The result page could not be downloaded (GetHTML returned null).</exception>
		public static ArrayList Run(String words,int start,int maxResults)
		{
			//Download HTML code
			String html=Parser.GetHTML(words,start,maxResults);
			if(html==null)
			{
				throw new System.Net.WebException("Yahoo Unable to process request at this time.");
			}
			//Parse HTML code
			return Parser.Run(html);
		}

		/// <summary>
		/// Parse Yahoo Search page, and get all result records in the page.
		/// </summary>
		/// <param name="HTML">HTML code to parse. Null or empty yields no results.</param>
		/// <returns>ArrayList of Parser.Record, one per parsed result. Returns null if there are no results.</returns>
		public static ArrayList Run(String HTML)
		{
			if(HTML==null||HTML.Length==0)
			{
				return null;
			}

			ArrayList ret = new ArrayList();
			//Walk forward through the page instead of copying and shrinking a buffer for
			//every record; cursor is the position just after the last consumed record.
			int cursor = 0;
			while(cursor<HTML.Length)
			{
				int start = HTML.IndexOf(RecordStartFlag,cursor,StringComparison.Ordinal);
				if(start==-1)
				{
					break;
				}
				//Look for the end flag only AFTER the start flag: an end flag that appears
				//earlier in the page must not produce a negative record length.
				int end = HTML.IndexOf(RecordEndFlag,start+RecordStartFlag.Length,StringComparison.Ordinal);
				if(end==-1)
				{
					break;
				}

				//Parse record unit (start flag included, end flag excluded)
				Parser.Record r = ParseRecord(HTML.Substring(start,end-start));
				if(r!=null)
				{
					ret.Add(r);
				}
				cursor = end+RecordEndFlag.Length;
			}

			//Callers expect null, not an empty list, when nothing was found.
			if(ret.Count==0)
			{
				return null;
			}
			return ret;
		}

		/// <summary>
		/// Parse the HTML of a single result record (as cut out by Run) into a Parser.Record.
		/// </summary>
		/// <param name="recordHTML">HTML from the record start flag up to, but excluding, the record end flag.</param>
		/// <returns>The parsed record, or null if the URL or title cannot be located or an error occurs
		/// (errors are written to the log via SinglePage.SinglePage.LogAppend). A missing snippet is not
		/// an error; the record is returned with an empty snippet.</returns>
		private static Parser.Record ParseRecord(String recordHTML)
		{
			try
			{
				//Parse URL: the target address follows the redirect marker and runs to the
				//end of the href attribute; the encoded "http%3a" is replaced by "http:".
				int urlStart = recordHTML.IndexOf(RedirectUrlFlag,StringComparison.Ordinal);
				if(urlStart==-1)
				{
					return null;
				}
				urlStart += RedirectUrlFlag.Length;
				int hrefEnd = recordHTML.IndexOf(TagCloseFlag,StringComparison.Ordinal);
				//If no record is found in this split code, return null.
				if(hrefEnd==-1||urlStart>=hrefEnd)
				{
					return null;
				}
				String url = "http:"+recordHTML.Substring(urlStart,hrefEnd-urlStart);

				//Parse Title: text between the end of the opening anchor tag and the first
				//closing anchor tag that follows it.
				int titleStart = hrefEnd+TagCloseFlag.Length;
				int titleEnd = recordHTML.IndexOf(AnchorEndFlag,titleStart,StringComparison.Ordinal);
				if(titleEnd==-1)
				{
					return null;
				}
				String title = StripBold(recordHTML.Substring(titleStart,titleEnd-titleStart));

				//Parse Snippet (optional): content of the yschabstr div.
				String snippet = "";
				int snippetStart = recordHTML.IndexOf(SnippetStartFlag,StringComparison.Ordinal);
				if(snippetStart!=-1)
				{
					snippetStart += SnippetStartFlag.Length;
					//IndexOf is bounds-safe, unlike a manual character-by-character look-ahead.
					int snippetEnd = recordHTML.IndexOf(DivEndFlag,snippetStart,StringComparison.Ordinal);
					if(snippetEnd!=-1)
					{
						snippet = StripBold(recordHTML.Substring(snippetStart,snippetEnd-snippetStart));
					}
				}

				return new Parser.Record(url,title,snippet);
			}
			catch(Exception ex)
			{
				//Best effort: one malformed record must not abort parsing of the whole page.
				SinglePage.SinglePage.LogAppend("Yahoo Record Parser Error:\r\n"+recordHTML+"\r\n[Detail:] "+ex.Message);
				return null;
			}
		}

		/// <summary>
		/// Remove the bold tags Yahoo puts around matched query words.
		/// </summary>
		/// <param name="text">Text that may contain &lt;b&gt; and &lt;/b&gt; tags.</param>
		/// <returns>The text without those tags.</returns>
		private static String StripBold(String text)
		{
			return text.Replace("<b>","").Replace("</b>","");
		}

	}
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -