📄 parser.cs
字号:
using System;
using System.Collections;
using System.Text;
using System.IO;
namespace GoogleParser
{
/// <summary>
/// Downloads Google Search result pages and parses them into result records.
/// </summary>
public class Parser:SinglePage.Parser
{
/// <summary>
/// Construct an URL of Google Search.
/// </summary>
/// <param name="words">Query words, unencoded; they are URL-escaped here. Null is treated as an empty query.</param>
/// <param name="start">start rank</param>
/// <param name="maxResults">max result length</param>
/// <returns>URL</returns>
private static String ConstructURL(String words,int start,int maxResults)
{
    // Escape the query so characters such as ' ', '&', '#' and '+' cannot
    // terminate or corrupt the q= parameter.
    String query = words==null ? "" : Uri.EscapeDataString(words);
    // Format the numbers culture-independently: the URL is machine-readable text.
    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;
    return "http://www.google.com/search?q="+query
        +"&num="+maxResults.ToString(invariant)
        +"&hl=en&lr=&start="+start.ToString(invariant);
}
/// <summary>
/// Downloads the Google Search result page for the given query and range.
/// </summary>
/// <param name="words">Query words</param>
/// <param name="start">Rank of the first result to request</param>
/// <param name="maxResults">Maximum number of results to request</param>
/// <returns>The downloaded HTML, or null when the download helper reports failure.</returns>
public static String GetHTML(String words,int start,int maxResults)
{
    // The download helper fills this buffer by reference and signals
    // success through its return value.
    String page = "";
    String address = GoogleParser.Parser.ConstructURL(words,start,maxResults);
    return SinglePage.SinglePage.Download(address,ref page) ? page : null;
}
/// <summary>
/// Downloads the Google Search page for the given words and range, then parses it into result records.
/// </summary>
/// <param name="words">Query words</param>
/// <param name="start">Rank of the first result to request</param>
/// <param name="maxResults">Maximum number of results to request</param>
/// <returns>ArrayList of GoogleParser.Parser.Record entries, or null if the page contains no results.</returns>
/// <exception cref="System.Net.WebException">Thrown when the result page could not be downloaded.</exception>
public static ArrayList Run(String words,int start,int maxResults)
{
    String page = Parser.GetHTML(words,start,maxResults);
    // A successful download is handed straight to the HTML overload.
    if(page!=null)
    {
        return Parser.Run(page);
    }
    throw new System.Net.WebException("Google Unable to process request at this time.");
}
/// <summary>
/// Parse Google Search page, and get all result records in the page.
/// </summary>
/// <param name="HTML">HTML code to parse. Null is treated as a page without results.</param>
/// <returns>Array List of GoogleParser.Parser.Record result records. Returns null if there are no results.</returns>
public static ArrayList Run(String HTML)
{
    if(HTML==null)
    {
        return null;
    }
    //*************Parse Google by its typical HTML code******
    //<p class=g>
    //<a class=l href="URL" onmousedown="return clk(this.href,'','','res','2','')">Title</a>
    //<table cellpadding=0 cellspacing=0 border=0 id="table1">
    //<tr><td class=j><font size=-1>Snippet<br>
    //<font color=#008000>
    //..........
    //Similar pages</a>
    const String startFlag = "<p class=g>";
    const String endFlag = "<font color=#008000>"; //some records only have one of the end flags.
    const String endFlag2 = "Similar pages</a>";
    ArrayList ret = new ArrayList();
    // Scan forward through the page instead of repeatedly copying and
    // trimming a buffer; pos is where the search for the next record resumes.
    int pos = 0;
    while(pos<HTML.Length)
    {
        int start = HTML.IndexOf(startFlag,pos,StringComparison.Ordinal);
        //If not found, parsing finished.
        if(start==-1)
        {
            break;
        }
        // A record ends at whichever end flag comes first after its start.
        int end = HTML.IndexOf(endFlag,start,StringComparison.Ordinal);
        int end2 = HTML.IndexOf(endFlag2,start,StringComparison.Ordinal);
        int flagLength = endFlag.Length;
        if(end==-1||(end2!=-1&&end2<end))
        {
            end = end2;
            // Skip by the length of the flag that actually matched, so the
            // scan never steps past the end of the page.
            flagLength = endFlag2.Length;
        }
        //If not found, parsing finished.
        if(end==-1)
        {
            break;
        }
        //Parse record unit
        Parser.Record r = ParseRecord(HTML.Substring(start,end-start));
        if(r!=null)
        {
            ret.Add(r);
        }
        pos = end+flagLength;
    }
    // Callers rely on null (not an empty list) to signal "no results".
    return ret.Count==0 ? null : ret;
}
/// <summary>
/// Parse the HTML of a single search result into a record.
/// </summary>
/// <param name="recordHTML">HTML of one result, starting at its start flag.</param>
/// <returns>Record built from the URL, title and snippet; null if any of them cannot be located or parsing fails.</returns>
private static Parser.Record ParseRecord(String recordHTML)
{
    //*************Parse Google by its typical HTML code******
    //<p class=g>
    //<a class=l href="URL" onmousedown="return clk(this.href,'','','res','2','')">Title</a>
    //<table cellpadding=0 cellspacing=0 border=0 id="table1">
    //<tr><td class=j><font size=-1>Snippet<br>
    if(recordHTML==null)
    {
        return null;
    }
    const String urlStartFlag = "href=\"";
    const String urlEndFlag = "\" onmousedown";
    const String titleStartFlag = "'')\">";
    const String titleEndFlag = "</a>";
    const String snippetStartFlag = "<font size=-1>";
    try
    {
        // Each marker is tested for -1 BEFORE any offset is applied, and
        // each end marker is searched from its start marker onwards.

        //Parse URL
        int start = recordHTML.IndexOf(urlStartFlag,StringComparison.Ordinal);
        if(start==-1)
        {
            return null;
        }
        start += urlStartFlag.Length;
        int end = recordHTML.IndexOf(urlEndFlag,start,StringComparison.Ordinal);
        if(end==-1)
        {
            return null;
        }
        String URL = recordHTML.Substring(start,end-start);

        //Parse Title
        start = recordHTML.IndexOf(titleStartFlag,end,StringComparison.Ordinal);
        if(start==-1)
        {
            return null;
        }
        start += titleStartFlag.Length;
        end = recordHTML.IndexOf(titleEndFlag,start,StringComparison.Ordinal);
        if(end==-1)
        {
            return null;
        }
        String title = recordHTML.Substring(start,end-start).Replace("<b>","").Replace("</b>","");

        //Parse Snippet
        start = recordHTML.IndexOf(snippetStartFlag,end,StringComparison.Ordinal);
        if(start==-1)
        {
            return null;
        }
        start += snippetStartFlag.Length;
        // The snippet runs up to the last <br>, or the last <nobr> when there is no <br>.
        end = recordHTML.LastIndexOf("<br>",StringComparison.Ordinal);
        if(end<0)
        {
            end = recordHTML.LastIndexOf("<nobr>",StringComparison.Ordinal);
        }
        // Also reject an end marker that lies before the snippet starts.
        if(end<start)
        {
            return null;
        }
        String snippet = recordHTML.Substring(start,end-start).Replace("<b>","").Replace("</b>","");
        return new Parser.Record(URL,title,snippet);
    }
    catch(Exception ex)
    {
        // Best-effort safety net: log the offending record and skip it.
        SinglePage.SinglePage.LogAppend("Google Record Parser Error:\r\n"+recordHTML+"\n[Detail:] "+ex.Message);
        return null;
    }
}
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -