// File: searchresult.java
// (code-viewer page residue) Font size:
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.Set;
import org.htmlparser.Parser;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.nodes.AbstractNode;
import org.htmlparser.util.ParserException;
/**
 * Scrapes search-engine result pages for links to people's public profiles on
 * LinkedIn.com.
 *
 * Pending terms are read from the {@code search_terms} table (rows with
 * {@code tag=0}). For every term the first result page is fetched, the
 * follow-up result pages it links to are fetched as well, and every
 * {@code href} containing the cached-result marker is collected and handed to
 * {@code SearchResultData.insert}. A processed term is then marked
 * {@code tag=1} and the transaction is committed.
 *
 * Each instance owns its own JDBC connection, which is closed when
 * {@link #parseAllPages()} returns, so an instance can be run only once.
 * The static counters are shared by all instances and are updated under the
 * class lock.
 *
 * @author james
 */
public class SearchResult implements Runnable {
    /**
     * Upper bound on the number of terms processed by all instances together.
     * It is a soft limit: a term that is already being processed is finished.
     */
    public static int MAXCOUNT = 490;

    /**
     * Optional SQL LIKE pattern (for example
     * {@code "http://www.linkedin.com/pub/_/0"}). When empty, which is the
     * default, pending terms are picked by id range and interleaving instead.
     */
    String term1 = "";

    /** Marker contained in links that point to a cached search result. */
    private static final String CACHE_LINK_PATTERN = "search?q=cache:";
    /** Marker contained in links that point to follow-up result pages. */
    private static final String NEXT_PAGES_PATTERN = "/search?q=http://www.linkedin.com/pub/";

    // Shared progress counters; guarded by the SearchResult.class lock.
    static int totalLinkCount = 0;
    static int totalLinkInsertCount = 0;
    static int recentLinkInsertCount = 0;
    static int totalPageCount = 0;
    static Date taskStart = new Date();
    static Google se = new Google();

    private final Connection cnn;
    long timeEplased;
    /** HTML of the most recently fetched page; null or empty if the fetch yielded nothing. */
    private String page = "";
    Set<String> links;
    Set<String> nextPagesLinks;
    // Work is split between workers by "id % interleaving == threadID";
    // for a single worker use interleaving 1 and threadID 0.
    private final int interleaving;
    private final int threadID;
    SearchResultData searchResultData;

    /**
     * Runs a single worker that processes all pending terms.
     *
     * @param args unused
     * @throws SQLException if the database connection cannot be obtained
     */
    public static void main(String[] args) throws SQLException {
        new SearchResult().run();
    }

    /**
     * Creates a worker that handles every pending term on its own.
     *
     * @throws SQLException if the database connection cannot be obtained
     */
    public SearchResult() throws SQLException {
        this(0, 1);
    }

    /**
     * Creates a worker that handles the terms whose id satisfies
     * {@code id % interleaving == threadID}.
     *
     * @param threadID     remainder selecting this worker's share of the terms
     * @param interleaving total number of workers the terms are split between
     * @throws SQLException if the database connection cannot be obtained
     */
    public SearchResult(int threadID, int interleaving) throws SQLException {
        this.interleaving = interleaving;
        this.threadID = threadID;
        this.cnn = DataAccess.getNewConnection();
        this.links = new HashSet<String>();
        this.nextPagesLinks = new HashSet<String>();
        this.searchResultData = new SearchResultData(cnn);
    }

    /**
     * Returns the smallest id among the pending terms ({@code tag=0}).
     *
     * @return the smallest pending id, or {@code Integer.MAX_VALUE} when no
     *         term is pending
     * @throws SQLException on database errors
     */
    private int getMinID() throws SQLException {
        PreparedStatement ps = null;
        ResultSet rs = null;
        try {
            ps = cnn.prepareStatement("SELECT min(id)as minid FROM search_terms s where tag=0");
            rs = ps.executeQuery();
            if (rs.next()) {
                int minId = rs.getInt("minid");
                // min() over zero rows yields one row holding SQL NULL, which
                // getInt reports as 0; wasNull tells the two cases apart.
                if (!rs.wasNull()) {
                    return minId;
                }
            }
            return Integer.MAX_VALUE;
        } finally {
            close(rs);
            close(ps);
        }
    }

    /**
     * Processes this worker's share of the terms, committing after each term,
     * until the terms run out or {@link #MAXCOUNT} terms have been processed.
     * On an SQL or I/O failure the open transaction is rolled back and
     * processing stops. The connection is always closed before returning.
     */
    public void parseAllPages() {
        try {
            cnn.setAutoCommit(false);
            // Read the work list completely before processing it: committing
            // while a ResultSet is still being iterated may close its cursor,
            // depending on the driver's holdability setting.
            ArrayList<Integer> ids = new ArrayList<Integer>();
            ArrayList<String> terms = new ArrayList<String>();
            loadTerms(ids, terms);
            for (int k = 0; k < ids.size() && !pageLimitReached(); k++) {
                parseATerm(ids.get(k).intValue(), terms.get(k));
                cnn.commit();
            }
        } catch (SQLException e) {
            rollback();
            e.printStackTrace();
        } catch (IOException e) {
            rollback();
            e.printStackTrace();
        } finally {
            try {
                cnn.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Loads the ids and terms this worker has to process into the two given
     * lists (same index = same row).
     */
    private void loadTerms(ArrayList<Integer> ids, ArrayList<String> terms) throws SQLException {
        PreparedStatement ps = null;
        ResultSet rs = null;
        try {
            if (term1 == null || term1.length() == 0) {
                int low = getMinID();
                if (low == Integer.MAX_VALUE) {
                    return; // no pending terms
                }
                // Look at a window 1.5 times MAXCOUNT wide, clamped so the
                // upper bound cannot overflow int.
                long window = Math.round(MAXCOUNT * 1.5);
                int high = (int) Math.min((long) Integer.MAX_VALUE, low + window);
                ps = cnn.prepareStatement("SELECT s.id,s.term, s.tag "
                        + "FROM search_terms s "
                        + "where tag=0 and id%?=? and id between ? and ?");
                ps.setInt(1, interleaving);
                ps.setInt(2, threadID);
                ps.setInt(3, low);
                ps.setInt(4, high);
            } else {
                // Bind the pattern instead of concatenating it into the SQL.
                ps = cnn.prepareStatement("SELECT s.id,s.term, s.tag "
                        + "FROM search_terms s where term like ?");
                ps.setString(1, term1);
            }
            rs = ps.executeQuery();
            while (rs.next()) {
                ids.add(Integer.valueOf(rs.getInt("id")));
                terms.add(rs.getString("term"));
            }
        } finally {
            close(rs);
            close(ps);
        }
    }

    /** Tells whether the shared term counter has reached {@link #MAXCOUNT}. */
    private static boolean pageLimitReached() {
        synchronized (SearchResult.class) {
            return totalPageCount >= MAXCOUNT;
        }
    }

    /**
     * Fetches one page into {@link #page} and collects its cached-result links
     * into {@link #links}. Does nothing more when the fetch yields no content.
     */
    private void parseAPage(String pageLink) throws IOException {
        // NOTE(review): 10000 is presumably a timeout in milliseconds - confirm in Util.
        page = Util.getAPage(pageLink, 10000);
        if (page == null || page.length() == 0) {
            return;
        }
        parseLinks(CACHE_LINK_PATTERN, links);
    }

    /**
     * Processes one search term: fetches the first result page and all
     * follow-up pages it links to, stores the collected links, marks the term
     * as done ({@code tag=1}) and reports progress.
     */
    private void parseATerm(int id, String term) throws SQLException, IOException {
        links.clear();
        nextPagesLinks.clear();
        parseAPage(se.quesryString() + term + se.quesryStringAppend());
        // "page" still holds the first result page here; harvest the links to
        // the follow-up pages from it before it is overwritten below.
        parseLinks(NEXT_PAGES_PATTERN, nextPagesLinks);
        for (String nextPage : nextPagesLinks) {
            parseAPage(se.host() + nextPage);
        }
        // NOTE(review): the meaning of the second argument (2) is defined by
        // SearchResultData.insert and is not visible here.
        int insertCount = searchResultData.insert(links, 2);
        setStatus(id, 1);
        showTask(insertCount, term);
    }

    /**
     * Updates the shared counters for one finished term and prints a progress
     * report. The "recent" insert counter is reset after every tenth term.
     */
    private void showTask(int insertCount, String term) {
        synchronized (SearchResult.class) {
            totalPageCount += 1;
            totalLinkCount += links.size();
            totalLinkInsertCount += insertCount;
            recentLinkInsertCount += insertCount;
            System.out.println("Links found by this term:" + term + ":"
                    + links.size() + "/Links inserted:" + insertCount);
            timeEplased = System.currentTimeMillis() - taskStart.getTime();
            System.out.println("Total terms count:" + totalPageCount
                    + "/Total links found:" + totalLinkCount
                    + "/ Total link insert:" + totalLinkInsertCount);
            System.out.println("Recent links inserted:" + recentLinkInsertCount
                    + ". /Time elapsed:" + timeEplased / 60000 + " minutes");
            if (totalPageCount % 10 == 0) {
                recentLinkInsertCount = 0;
            }
        }
    }

    /** Sets the {@code tag} column of the term with the given id. */
    private void setStatus(int id, int i) throws SQLException {
        PreparedStatement ps = null;
        try {
            ps = cnn.prepareStatement("update search_terms set tag=? where id=?");
            ps.setInt(1, i);
            ps.setInt(2, id);
            ps.executeUpdate();
        } finally {
            close(ps);
        }
    }

    /**
     * Scans the most recently fetched page and adds every {@code href}
     * attribute value that contains {@code pattern} to {@code set}. Nothing is
     * added when no page content is available or the page cannot be parsed.
     *
     * @param pattern substring a link must contain to be collected
     * @param set     receives the matching links
     */
    public void parseLinks(String pattern, Set<String> set) {
        // A failed fetch leaves no content; handing null to the parser is
        // expected to be rejected by it, so skip parsing altogether.
        if (page == null || page.length() == 0) {
            return;
        }
        Parser parser = new Parser();
        try {
            parser.setInputHTML(page);
        } catch (ParserException e) {
            e.printStackTrace();
            return; // no usable input, nothing to scan
        }
        Lexer lexer = parser.getLexer();
        try {
            AbstractNode node;
            while ((node = (AbstractNode) lexer.nextNode()) != null) {
                if (!(node instanceof org.htmlparser.nodes.TagNode)) {
                    continue;
                }
                String link = ((org.htmlparser.nodes.TagNode) node).getAttribute("href");
                if (link != null && link.contains(pattern)) {
                    set.add(link);
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
    }

    /** Processes all terms assigned to this worker, then prints a completion line. */
    public synchronized void run() {
        parseAllPages();
        System.out.println("--------------------Thread "
                + Thread.currentThread().getName()
                + " Finished-----------------");
    }

    /** Rolls back the open transaction; a failure to do so is only logged. */
    private void rollback() {
        try {
            cnn.rollback();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /** Closes a statement if it was opened; a failure to do so is only logged. */
    private static void close(PreparedStatement ps) {
        if (ps == null) {
            return;
        }
        try {
            ps.close();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /** Closes a result set if it was opened; a failure to do so is only logged. */
    private static void close(ResultSet rs) {
        if (rs == null) {
            return;
        }
        try {
            rs.close();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}
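/*
 * Code-viewer page residue (not part of the program) - keyboard shortcuts:
 *   Copy code            Ctrl + C
 *   Search code          Ctrl + F
 *   Full-screen mode     F11
 *   Toggle theme         Ctrl + Shift + D
 *   Show shortcuts       ?
 *   Increase font size   Ctrl + =
 *   Decrease font size   Ctrl + -
 */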