📄 parser.cs
字号:
using System;
using System.IO;
using System.Text;
using System.Collections;
namespace YahooParser
{
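/// <summary>
/// Yahoo Search page parser: builds the search URL, downloads the result page,
/// and extracts the result records (URL, title, snippet) from its HTML.
/// </summary>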
public class Parser:SinglePage.Parser
{
/// <summary>
/// Construct an URL for Yahoo Search.
/// </summary>
/// <param name="words">Raw (not yet URL-encoded) query words; null is treated as an empty query.</param>
/// <param name="start">start rank, sent as the "b" query parameter</param>
/// <param name="maxResults">max result count, sent as the "n" query parameter</param>
/// <returns>URL to search</returns>
private static String ConstructURL(String words,int start,int maxResults)
{
    // Percent-encode the query so that spaces, '&', '#', '+' and non-ASCII characters
    // cannot truncate or corrupt the query string. Uri.EscapeDataString emits UTF-8
    // escapes, in line with the ei=UTF-8 parameter already present in the URL.
    String query = (words==null) ? "" : Uri.EscapeDataString(words);
    // Format the numbers with the invariant culture so the URL does not depend on the
    // culture of the calling thread.
    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;
    return "http://search.yahoo.com/search?ei=UTF-8&p="+query
        +"&b="+start.ToString(invariant)
        +"&n="+maxResults.ToString(invariant);
}
/// <summary>
/// Download the HTML code of the Yahoo Search result page for the given
/// query words, start rank and max result count.
/// </summary>
/// <param name="words">Query words</param>
/// <param name="start">start rank</param>
/// <param name="maxResults">max result count</param>
/// <returns>HTML code string; null if the download reports failure.</returns>
public static String GetHTML(String words,int start,int maxResults)
{
    String page="";
    // The downloader receives the buffer by reference and signals success through its return value.
    bool downloaded = SinglePage.SinglePage.Download(
        YahooParser.Parser.ConstructURL(words,start,maxResults),
        ref page);
    return downloaded ? page : null;
}
/// <summary>
/// Search Yahoo for the given words and range, and parse the downloaded result page.
/// </summary>
/// <param name="words">Query words</param>
/// <param name="start">start rank</param>
/// <param name="maxResults">max result length</param>
/// <returns>Array List of YahooParser.Parser.Record result records. Returns NULL if there are no results.</returns>
/// <exception cref="System.Net.WebException">The result page could not be downloaded.</exception>
public static ArrayList Run(String words,int start,int maxResults)
{
    // Fetch the page first; a null page means the download failed.
    String page=Parser.GetHTML(words,start,maxResults);
    if(page!=null)
        return Parser.Run(page);
    throw new System.Net.WebException("Yahoo Unable to process request at this time.");
}
/// <summary>
/// Parse Yahoo Search page, and get all result records in the page.
/// </summary>
/// <param name="HTML">HTML code to parse. A null page is treated like a page without results.</param>
/// <returns>Array List of YahooParser.Parser.Record result records. Returns NULL if there are no results.</returns>
public static ArrayList Run(String HTML)
{
    ArrayList ret = new ArrayList();
    if(HTML==null)
        return null;
    //*************Parse Yahoo by its typical HTML code******
    //<li><div><a class=yschttl
    //href="http://rds.yahoo.com/_ylt=A0geu..........0335/**http%3a//www.google.com/"><b>
    //:::Title:::</b></a>
    //<a href="http://rds.yahoo.com/_ylt=A0geu.........0335/**http%3a//www.google.com/" target=_blank>
    // <img src="nw2.gif" height=11 width=11 border=0 alt="Open this result in new window"></a>
    // (Nasdaq: <b><a href="http://rds.yahoo.com/_ylt=A0geu.......0335/**http%3a//finance.yahoo.com/q%3fs=GOOG%26d=t" class=yschls>
    // GOOG</a></b>)
    //</div>
    // <div class=yschabstr>:::Snippet:::</div>
    // <div> Category: <a href="http://rds.yahoo.com/_ylt=A0ge.......nd_World_Wide_Web/Search_and_Navigation/">
    // Web Search and Navigation Services</a></div>
    //
    //<em class=yschurl>www.<b>google</b>.com</em>
    // - <a href="http://rds.yahoo.com/_ylt=A0ge.........0%26vs=www.google.com">
    //More from this site</a>
    //<span class=yschprs><em> -
    //<a href="http://us.rd.yahoo.com/evt=383...........return false" >
    //Save</a></em></span>
    String startFlag = "<li><div><a class=yschttl";
    String endFlag = "<span class=yschprs><em>";
    // Walk the page with a cursor instead of repeatedly copying and shrinking a buffer:
    // every record unit runs from a start flag up to the next end flag that FOLLOWS it.
    int pos = 0;
    while(pos<HTML.Length)
    {
        int start = HTML.IndexOf(startFlag,pos,StringComparison.Ordinal);
        //If not found, parsing finished.
        if(start==-1)
            break;
        // Search from the start flag onwards, so an end flag that occurs earlier in the
        // page can never produce a negative record length.
        int end = HTML.IndexOf(endFlag,start,StringComparison.Ordinal);
        if(end==-1)
            break;
        //Parse record unit
        Parser.Record r = ParseRecord(HTML.Substring(start,end-start));
        if(r!=null)
            ret.Add(r);
        // Continue right behind the consumed end flag.
        pos = end+endFlag.Length;
    }
    if(ret.Count==0)
        return null;
    return ret;
}
/// <summary>
/// Parse the HTML of a single search result into a record (URL, title, snippet).
/// </summary>
/// <param name="recordHTML">HTML code of one result unit.</param>
/// <returns>
/// The parsed record, or null if no URL or title can be located in the code or if
/// parsing fails (failures are written to the log). The snippet is empty when the
/// unit has no complete snippet block.
/// </returns>
private static Parser.Record ParseRecord(String recordHTML)
{
    //*************Parse Yahoo by its typical HTML code******
    //<li><div><a class=yschttl
    //href="http://rds.yahoo.com/_ylt=A0geu..........0335/**http%3a//www.google.com/">
    //:::Title:::</a>
    //<a href="http://rds.yahoo.com/_ylt=A0geu.........0335/**http%3a//www.google.com/" target=_blank>
    // <img src="nw2.gif" height=11 width=11 border=0 alt="Open this result in new window"></a>
    // (Nasdaq: <b><a href="http://rds.yahoo.com/_ylt=A0geu.......0335/**http%3a//finance.yahoo.com/q%3fs=GOOG%26d=t" class=yschls>
    // GOOG</a></b>)
    //</div>
    // <div class=yschabstr>:::Snippet:::</div>
    // <div> Category: <a href="http://rds.yahoo.com/_ylt=A0ge.......nd_World_Wide_Web/Search_and_Navigation/">
    // Web Search and Navigation Services</a></div>
    //
    //<em class=yschurl>www.<b>google</b>.com</em>
    // - <a href="http://rds.yahoo.com/_ylt=A0ge.........0%26vs=www.google.com">
    //More from this site</a>
    //<span class=yschprs><em> -
    //<a href="http://us.rd.yahoo.com/evt=383...........return false" >
    //Save</a></em></span>
    const String urlFlag = "**http%3a";
    const String tagClose = "\">";
    const String anchorClose = "</a>";
    const String snippetOpen = "<div class=yschabstr>";
    const String snippetClose = "</div>";
    try
    {
        //Parse URL: it sits between the redirect marker and the first "\">" of the unit.
        int urlStart = recordHTML.IndexOf(urlFlag,StringComparison.Ordinal);
        if(urlStart==-1)
            return null;
        urlStart += urlFlag.Length;
        int urlEnd = recordHTML.IndexOf(tagClose,StringComparison.Ordinal);
        //If no record is found in this split code, return null.
        if(urlEnd==-1||urlStart>=urlEnd)
            return null;
        // NOTE(review): everything after "http:" is copied exactly as it appears in the
        // redirect link, so escapes such as "%3f"/"%26" (see the sample above) stay
        // encoded - confirm whether callers expect a decoded URL.
        String URL = "http:"+recordHTML.Substring(urlStart,urlEnd-urlStart);
        //Parse Title: the text between that "\">" and the next "</a>".
        int titleStart = urlEnd+tagClose.Length;
        // Search from the title start so an earlier "</a>" cannot yield a negative length.
        int titleEnd = recordHTML.IndexOf(anchorClose,titleStart,StringComparison.Ordinal);
        if(titleEnd==-1)
            return null;
        String title = recordHTML.Substring(titleStart,titleEnd-titleStart).Replace("<b>","").Replace("</b>","");
        //Parse Snippet (optional): the content of the yschabstr block up to its closing tag.
        String snippet = "";
        int snippetStart = recordHTML.IndexOf(snippetOpen,StringComparison.Ordinal);
        if(snippetStart!=-1) //start point exists
        {
            // A bounded library search replaces the manual character scan, which read
            // up to five characters past the end of the string.
            int snippetEnd = recordHTML.IndexOf(snippetClose,snippetStart,StringComparison.Ordinal);
            if(snippetEnd!=-1) //if there is snippet
            {
                int contentStart = snippetStart+snippetOpen.Length;
                snippet = recordHTML.Substring(contentStart,snippetEnd-contentStart);
            }
        }
        return new Parser.Record( URL,title,snippet.Replace("<b>","").Replace("</b>","") );
    }
    catch(Exception ex)
    {
        // Best effort: a single malformed unit is logged and skipped instead of aborting the whole page.
        SinglePage.SinglePage.LogAppend("Yahoo Record Parser Error:\r\n"+recordHTML+"\r\n[Detail:] "+ex.Message);
        return null;
    }
}
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -