📄 parser.cs
字号:
using System;
using System.IO;
using System.Text;
using System.Collections;
namespace MSNParser
{
/// <summary>
/// Downloads MSN Search result pages and parses them into result records
/// (URL, title and snippet of each hit).
/// </summary>
public class Parser:SinglePage.Parser
{
    /// <summary>
    /// Construct an URL of MSN Search.
    /// </summary>
    /// <param name="words">Query words. Inserted into the query string exactly as given.</param>
    /// <param name="start">Start rank; sent as the "first" query parameter.</param>
    /// <param name="maxResults">Currently not used by this method.</param>
    /// <returns>URL</returns>
    private static String ConstructURL(String words,int start,int maxResults)
    {
        //NOTE(review): words is not URL-encoded here, so characters such as '&' or '#'
        //would change the query. Left untouched because callers may already pass
        //encoded text - confirm before adding escaping.
        return "http://search.msn.com/results.aspx?q="+words+"&first="+start.ToString();
    }

    /// <summary>
    /// Download the MSN Search result page for the given words and start rank.
    /// </summary>
    /// <param name="words">Query words</param>
    /// <param name="start">start rank</param>
    /// <param name="maxResults">max result length (only forwarded to the URL construction)</param>
    /// <returns>HTML code of the page, or NULL if SinglePage.SinglePage.Download reports failure.</returns>
    public static String GetHTML(String words,int start,int maxResults)
    {
        //Construct URL
        String url = Parser.ConstructURL(words,start,maxResults);
        //Download HTML code
        String html = "";
        if(!SinglePage.SinglePage.Download(url,ref html))
            return null;
        return html;
    }

    /// <summary>
    /// Get search results of given words and range from MSN Search web page.
    /// </summary>
    /// <param name="words">Query words</param>
    /// <param name="start">start rank</param>
    /// <param name="maxResults">max result length</param>
    /// <returns>ArrayList of Parser.Record result records. Return NULL if start is greater than 250,
    /// if the download fails, or if there is no results.</returns>
    public static ArrayList Run(String words,int start,int maxResults)
    {
        //Start rank must not exceed 250
        if(start>250)
            return null;
        //Download HTML code
        String html = Parser.GetHTML(words,start,maxResults);
        //A failed download yields NULL: there is nothing to parse
        if(html==null)
            return null;
        //Parse HTML code
        return Parser.Run(html);
    }

    /// <summary>
    /// Parse MSN Search page, and get all result records in the page.
    /// </summary>
    /// <param name="HTML">HTML code to parse. NULL or empty is treated as a page without results.</param>
    /// <returns>ArrayList of Parser.Record result records. Return NULL if there is no results.</returns>
    public static ArrayList Run(String HTML)
    {
        if(HTML==null||HTML.Length==0)
            return null;

        //Get Result Section, if there is a section
        //<h2>Results</h2>
        //.......
        //<h2>SPONSORED SITES</h2> (possibly missing)
        String section = HTML;
        int sectionStart = section.IndexOf("<h2>Results</h2>");
        if(sectionStart!=-1)
            section = section.Substring(sectionStart);
        int sectionEnd = section.IndexOf("<h2>SPONSORED SITES</h2>");
        if(sectionEnd!=-1)
            section = section.Substring(0,sectionEnd);

        //*************Parse MSN by its typical HTML code******
        //<H3><A href=":::URL:::">:::Title:::</A></H3>
        //  <P>:::Snippet:::</P>
        //  <UL>
        //  <LI class=first>.....URL Presentation......
        //  <LI><A
        //  href="http://cc.msnscache.com/cache.aspx?q=3271652021366&lang=en-US&mkt=en-US&FORM=CVRE">Cached
        //  page</A> </LI></UL>
        //  <LI>
        String startFlag = "<h3><a href=\"";
        String endFlag = "<li class=\"first\">";
        ArrayList ret = new ArrayList();

        //Scan forward with an index instead of repeatedly copying and shrinking a buffer.
        int position = 0;
        while(position<section.Length)
        {
            int start = section.IndexOf(startFlag,position);
            int end = section.IndexOf(endFlag,position);
            //If not found, parsing finished.
            if(start==-1||end==-1)
                break;
            //If format error (record end seen before record start), log the unparsed rest and finish.
            if(start>=end)
            {
                SinglePage.SinglePage.LogAppend("MSN Record Split Error:\r\n"+section.Substring(position));
                break;
            }
            //Parse record unit: everything from startFlag up to (not including) endFlag
            Parser.Record r = ParseRecord(section.Substring(start,end-start));
            if(r!=null)
                ret.Add(r);
            //Continue behind the consumed endFlag
            position = end+endFlag.Length;
        }

        if(ret.Count==0)
            return null;
        return ret;
    }

    /// <summary>
    /// Parse one record unit into a Parser.Record.
    /// </summary>
    /// <param name="recordHTML">HTML of one record, expected in the form
    /// &lt;h3&gt;&lt;a href="URL"&gt;Title&lt;/a&gt;&lt;/h3&gt; optionally followed by &lt;p&gt;Snippet&lt;/p&gt;.</param>
    /// <returns>The parsed record (snippet is empty if the record has none).
    /// NULL if the URL or title cannot be located or an unexpected error occurs.</returns>
    private static Parser.Record ParseRecord(String recordHTML)
    {
        String URL,title,snippet="";
        try
        {
            //Parse URL: the text between href=" and the next quote
            String hrefFlag = "href=\"";
            int urlStart = recordHTML.IndexOf(hrefFlag);
            if(urlStart==-1)
                return null;
            urlStart += hrefFlag.Length;
            int urlEnd = recordHTML.IndexOf("\"",urlStart);
            if(urlEnd==-1)
                return null;
            URL = recordHTML.Substring(urlStart,urlEnd-urlStart);

            //Parse Title: the text between the end of the opening <a ...> tag and </a></h3>.
            //Check for "not found" BEFORE applying any offset, otherwise -1 is masked.
            String tagClose = "\">";
            int titleStart = recordHTML.IndexOf(tagClose,urlEnd);
            if(titleStart==-1)
                return null;
            titleStart += tagClose.Length;
            int titleEnd = recordHTML.IndexOf("</a></h3>",titleStart);
            if(titleEnd==-1)
                return null;
            title = RemoveStrongTags(recordHTML.Substring(titleStart,titleEnd-titleStart));

            //Parse Snippet (optional): the text between <p> and the following </p>
            String snippetOpen = "<p>";
            int snippetStart = recordHTML.IndexOf(snippetOpen);
            if(snippetStart!=-1)
            {
                snippetStart += snippetOpen.Length;
                int snippetEnd = recordHTML.IndexOf("</p>",snippetStart);
                if(snippetEnd!=-1) //if there is snippet
                    snippet = RemoveStrongTags(recordHTML.Substring(snippetStart,snippetEnd-snippetStart));
            }
            return new Parser.Record(URL,title,snippet);
        }
        catch(Exception ex)
        {
            //Best effort: a record that cannot be parsed is logged and skipped.
            SinglePage.SinglePage.LogAppend("MSN Record Parser Error:\r\n"+recordHTML+"\n[Detail:] "+ex.Message);
            return null;
        }
    }

    /// <summary>
    /// Remove the &lt;strong&gt; and &lt;/strong&gt; tags from a text fragment.
    /// </summary>
    private static String RemoveStrongTags(String text)
    {
        return text.Replace("<strong>","").Replace("</strong>","");
    }
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -