searchresult2.java
来自「利用多线程从搜索引擎下载网页并提取数据到数据库。」· Java 代码 · 共 329 行
JAVA
329 行
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.htmlparser.Parser;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.nodes.AbstractNode;
import org.htmlparser.util.ParserException;
/**
* Read stored links from table search_result_prepare, scrape each page for
* links to persons' profile on LinkIn.Com
*
* @author james
*
*/
/**
 * Reads stored search terms from table {@code search_terms}, fetches the
 * search-engine result pages for each term, extracts cached-result links that
 * point to persons' profiles on LinkedIn.com, and persists the decoded links
 * through {@link SearchResultData}.
 *
 * <p>Designed to run as one of several cooperating worker threads: each
 * instance processes only the rows whose {@code id % interleaving == threadID}.
 *
 * @author james
 */
public class SearchResult2 implements Runnable {

    /** Maximum number of result pages a single run will process. */
    public static int MAXCOUNT = 490;
    /** Number of cooperating threads; rows are partitioned by id % interleaving. */
    public static int interleaving;
    public static TaskOptimizer threadOpt;
    public static TaskOptimizer pageOpt;
    /** Seed term used when the class is driven directly from main(). */
    static String term1;
    static {
        term1 = "http://www.linkedin.com/pub/0/4";
    }
    /** Substring that identifies a cached-result link inside a scraped page. */
    final private static String pattern = "/search/cache?ipc=";
    // final private static String patternNextPages =
    // "/search?q=http://www.linkedin.com/pub/";

    // Shared run statistics, accumulated across all worker instances.
    // NOTE(review): these static counters are updated without synchronization;
    // counts may drift when several threads run concurrently — confirm whether
    // exact totals matter.
    static int totalLinkCount = 0;
    static int totalLinkInsertCount = 0;
    static int recentLinkInsertCount = 0;
    static int totalPageCount = 0;
    static Date taskStart = new Date();
    /** Search-engine adapter that builds query URLs. */
    static Yahoo se = new Yahoo();

    private Connection cnn;
    long timeEplased;
    /** Raw HTML of the most recently fetched page (set by getAPage). */
    private String page = "";
    /** Raw (still URL-encoded) candidate links harvested from the pages. */
    Set<String> links;
    /** URL-decoded links, ready for insertion. */
    Set<String> links2;
    /** Links to the follow-up result pages ("page 2, 3, ..." of the search). */
    Set<String> nextPagesLinks;
    // Used to partition work across threads; when run single-threaded,
    // interleaving is set to 1 so every row matches.
    private int threadID;
    SearchResultData searchResultData;

    /**
     * Entry point for a single-term debug run against {@link #term1}.
     *
     * @param args unused
     * @throws SQLException if the database connection cannot be established
     */
    public static void main(String[] args) throws SQLException {
        // new SearchResult().run();
        try {
            new SearchResult2().parseATerm(1, term1);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Single-threaded constructor: this worker handles every row
     * (interleaving = 1, threadID = 0).
     *
     * @throws SQLException if the database connection cannot be established
     */
    public SearchResult2() throws SQLException {
        // interleaving is static/class-wide; the original assigned it through
        // `this`, which obscured that every instance shares it.
        interleaving = 1;
        this.threadID = 0;
        constructor();
    }

    /**
     * Multi-threaded constructor: this worker handles the rows whose
     * {@code id % interleaving == threadID}.
     *
     * @param threadID this worker's slot in the interleaving partition
     * @throws SQLException if the database connection cannot be established
     */
    public SearchResult2(int threadID) throws SQLException {
        this.threadID = threadID;
        constructor();
    }

    /** Shared constructor body: opens the DB connection and the link sets. */
    private void constructor() throws SQLException {
        cnn = DataAccess.getNewConnection();
        links = new HashSet<String>();
        links2 = new HashSet<String>();
        nextPagesLinks = new HashSet<String>();
        searchResultData = new SearchResultData(cnn);
    }

    /**
     * Returns the smallest id of an unprocessed (tag=0) search term, or
     * {@link Integer#MAX_VALUE} when none remain.
     *
     * @throws SQLException on query failure
     */
    private int getMinID() throws SQLException {
        PreparedStatement ps = cnn
                .prepareStatement("SELECT min(id)as minid FROM search_terms s where tag=0");
        try {
            ResultSet rs = ps.executeQuery();
            try {
                if (rs.next())
                    return rs.getInt("minid");
                return Integer.MAX_VALUE;
            } finally {
                rs.close(); // fix: result set was never closed
            }
        } finally {
            ps.close(); // fix: statement was never closed
        }
    }

    /**
     * Main worker loop: walks this thread's slice of unprocessed search terms
     * and scrapes each one, committing after every term. The connection is
     * closed when the loop ends, so an instance is single-use.
     */
    public void parseAllPages() {
        PreparedStatement ps = null;
        ResultSet rs = null;
        try {
            cnn.setAutoCommit(false);
            int low = getMinID();
            // Window of ids to claim this pass. (The original wrapped the int
            // product in Math.round, which was redundant.)
            int high = low + MAXCOUNT * 5;
            ps = cnn.prepareStatement("SELECT s.id,s.term, s.tag "
                    + "FROM search_terms s "
                    + "where tag=0 and id%?=? and id between ? and ?");
            ps.setInt(1, interleaving);
            ps.setInt(2, threadID);
            ps.setInt(3, low);
            ps.setInt(4, high);
            rs = ps.executeQuery();
            while (rs.next() && (totalPageCount < MAXCOUNT)) {
                parseATerm(rs.getInt("id"), rs.getString("term"));
                cnn.commit(); // one term == one transaction
                // threadOpt.reportTask(true);
            }
        } catch (SQLException e) {
            try {
                cnn.rollback();
            } catch (SQLException e1) {
                e1.printStackTrace();
            }
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
            // if (e.getMessage().contains("999")){
            // threadOpt.reportTask(false);
            // pageOpt.reportTask(false);
            // }
        } finally {
            // fix: statement/result set were leaked; connection close kept
            // here so it happens on every exit path, as in the original.
            if (rs != null) {
                try {
                    rs.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
            if (ps != null) {
                try {
                    ps.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
            try {
                cnn.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Fetches one follow-up result page and harvests its links into
     * {@link #links}.
     *
     * @param pageLink absolute URL of the result page
     * @throws IOException if the page cannot be fetched or looks truncated
     */
    private void parseAPage(String pageLink) throws IOException {
        getAPage(pageLink, 20000);
        // fix: null must be checked before emptiness, and String equality
        // must not use == (the original compared `page == ""`).
        if (page == null || page.length() == 0)
            return;
        parseLinks();
    }

    /**
     * Processes one search term end-to-end: fetch the first result page,
     * discover the follow-up pages, harvest links from all of them, decode
     * them, insert them, and mark the term as done (tag=1).
     *
     * @param id   row id of the term in search_terms
     * @param term the search term itself
     * @throws SQLException on insert/update failure
     * @throws IOException  if a page fetch fails
     */
    private void parseATerm(int id, String term) throws SQLException,
            IOException {
        int insertCount = 0;
        links.clear();
        links2.clear();
        getAPage(se.quesryString() + term, 20000);
        // fix: null-first check with proper String emptiness test.
        if (page == null || page.length() == 0)
            return;
        parseLinks();
        nextPagesLinks.clear();
        parsenextPages(term, page);
        for (String s : nextPagesLinks)
            parseAPage(s);
        decodeUrl();
        insertCount = searchResultData.insert(links2, 1);
        setStatus(id, 1);
        showTask(insertCount, term);
    }

    /**
     * URL-decodes every harvested link from {@link #links} into
     * {@link #links2}, un-escaping the HTML {@code &amp;} entity first.
     */
    private void decodeUrl() {
        for (String raw : links) {
            try {
                // fix: the original called s.replace("&", "&"), a no-op —
                // almost certainly a garbled "&amp;" -> "&" replacement from
                // HTML-escaped hrefs; restored the intended un-escape.
                String s = raw.replace("&amp;", "&");
                links2.add(java.net.URLDecoder.decode(s, "utf-8"));
            } catch (UnsupportedEncodingException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Parses the "Pages (N)" marker out of the first result page and builds
     * the URLs of the follow-up pages (100 results per page) into
     * {@link #nextPagesLinks}.
     *
     * @param term the search term, re-embedded in each follow-up URL
     * @param page raw HTML of the first result page
     */
    private void parsenextPages(String term, String page) {
        // Matches e.g. "Pages (813)" or "Pages (1,024)".
        Pattern p = Pattern.compile("Pages \\([\\d\\,]{1,5}\\)");
        Matcher m = p.matcher(page);
        if (!m.find())
            return;
        String totalPageString = m.group();
        int totalPage = Util.getNumberFromString(totalPageString.replace(",", ""));
        int count = 1;
        while (count < totalPage) {
            count += 100; // result offset advances 100 per page
            String url = se.quesryString() + term + "&b=" + count
                    + "&bwm=p&bwmo=&bwmf=";
            nextPagesLinks.add(url);
        }
    }

    /**
     * Updates the run statistics and prints a one-line progress report for
     * the term just processed.
     */
    private void showTask(int insertCount, String term) {
        totalPageCount += 1;
        totalLinkCount += links.size();
        totalLinkInsertCount += insertCount;
        recentLinkInsertCount += insertCount;
        System.out.println("Links found by this term:" + term + ":"
                + links.size() + "/Links inserted:" + insertCount);
    }

    /**
     * Sets the tag column of a search term row (1 = processed).
     *
     * @throws SQLException on update failure
     */
    private void setStatus(int id, int i) throws SQLException {
        PreparedStatement ps = cnn
                .prepareStatement("update search_terms set tag=? where id=?");
        try {
            ps.setInt(1, i);
            ps.setInt(2, id);
            ps.execute();
        } finally {
            ps.close(); // fix: statement was never closed
        }
    }

    /**
     * Scans {@link #page} with the htmlparser lexer and collects every href
     * containing {@link #pattern} into {@link #links}.
     */
    public void parseLinks() {
        Parser p = new Parser();
        try {
            p.setInputHTML(page);
        } catch (ParserException e) {
            e.printStackTrace();
            // fix: the original fell through to the lexer even after a parse
            // failure; bail out instead of scanning an unset lexer.
            return;
        }
        Lexer l = p.getLexer();
        AbstractNode n;
        String link;
        try {
            while ((n = (AbstractNode) l.nextNode()) != null) {
                if (n instanceof org.htmlparser.nodes.TagNode) {
                    link = ((org.htmlparser.nodes.TagNode) n)
                            .getAttribute("href");
                    if (link != null && link.contains(pattern))
                        links.add(link);
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
    }

    /** Runnable entry point: drains this thread's slice of terms. */
    public void run() {
        parseAllPages();
        System.out.println("--------------------Thread "
                + Thread.currentThread().getName()
                + " Finished-----------------");
    }

    /**
     * Fetches a URL into {@link #page}, throttled by
     * {@code Moderator.ensureInterval()}. Throws when the response does not
     * look like a complete, non-403 HTML document, so callers can retry.
     *
     * @param pageLink URL to fetch
     * @param timeout  nominal timeout in ms — NOTE(review): currently
     *                 overridden below with a fixed 10 minutes; confirm this
     *                 is intended before honoring the parameter.
     * @throws IOException on network failure, or with the page body as the
     *                     message when the page looks truncated/forbidden
     */
    private void getAPage(String pageLink, int timeout) throws IOException {
        Moderator.ensureInterval();
        // Deliberately(?) clobbers the caller-supplied timeout — preserved
        // from the original to keep behavior identical.
        timeout = 10 * 60 * 1000;
        URL url = new URL(pageLink);
        HttpURLConnection urlCon = (HttpURLConnection) url.openConnection();
        urlCon.setRequestProperty("User-agent", "IE/6.0");
        urlCon.setReadTimeout(timeout);
        urlCon.setConnectTimeout(timeout);
        StringBuilder sb = new StringBuilder();
        final int cBufSize = 1000;
        char[] cbuf = new char[cBufSize];
        BufferedReader br = null;
        try {
            // NOTE(review): uses the platform default charset, as the
            // original did; the pages are later matched on ASCII markers so
            // this mostly works, but an explicit charset would be safer.
            br = new BufferedReader(new InputStreamReader(urlCon.getInputStream()));
            int i;
            while ((i = br.read(cbuf, 0, cBufSize)) > 0) {
                sb.append(cbuf, 0, i);
            }
        } finally {
            // fix: the reader (and underlying connection stream) was leaked.
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        page = sb.toString();
        // Accept the page only when it looks like a complete HTML document
        // and is not a 403 error page (checked in both cases).
        boolean looksComplete = (page.contains("<html") && page.contains("</html>")
                && !page.contains("<title>403 Forbidden</title>"))
                || (page.contains("<HTML") && page.contains("</HTML>")
                && !page.contains("<TITLE>403 Forbidden</TITLE>"));
        if (!looksComplete) {
            throw new IOException(page);
        }
    }
}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?