// Source file: searchresult1.java
// (Code-viewer page residue, translated: "Font size:")
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.sql.CallableStatement;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.htmlparser.Parser;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.nodes.AbstractNode;
import org.htmlparser.util.ParserException;
/**
 * Scrapes search-engine result pages for cached-result links and stores them.
 *
 * <p>Original author's description: read stored links from table
 * search_result_prepare and scrape each page for links to persons' profiles
 * on LinkedIn.com.
 *
 * <p>What the code in this class does: {@link #parseAllPages()} obtains
 * (id, term) rows from the stored procedure {@code getTermsInterleaving};
 * for each term the first result page and its follow-up pages are downloaded,
 * every {@code href} containing {@code /search/cache?ipc=} is collected,
 * URL-decoded, handed to {@code SearchResultData.insert}, and the term is
 * tagged as done in {@code search_terms}.
 *
 * <p>Thread-safety: the static counters below are plain {@code int}s that are
 * updated without synchronization, so their values are only approximate when
 * several instances run in parallel. Each instance owns its own connection.
 *
 * @author james
 */
public class SearchResult1 implements Runnable {
    /** Processing stops once {@link #recentGoodPageCount} reaches this value. */
    public static int MAXCOUNT = 490;
    /** Upper bound on the number of pages a single instance downloads. */
    public static int MAXSTPAGECOUNT = 50;
    /**
     * Passed to the stored procedure and used as divisor of {@link #MAXCOUNT};
     * must be non-zero before {@link #parseAllPages()} is called.
     */
    public static int interleaving;
    // Not used by the live code in this class (only by commented-out calls).
    public static TaskOptimizer threadOpt;
    public static TaskOptimizer pageOpt;
    /** Term id used by {@link #main(String[])}. */
    public static int termID1 = 490;
    /** Term used by {@link #main(String[])}. */
    static String term1 = "http://www.linkedin.com/pub/1/551";
    /** Substring an href must contain to be collected by {@link #parseLinks()}. */
    final private static String pattern = "/search/cache?ipc=";
    /** Matches the result-count marker on a result page, e.g. "Pages (813)". */
    private static final Pattern TOTAL_PAGES_PATTERN = Pattern
            .compile("Pages \\([\\d\\,]{1,5}\\)");
    // final private static String patternNextPages =
    // "/search?q=http://www.linkedin.com/pub/";
    static int totalLinkCount = 0;
    static int totalLinkInsertCount = 0;
    static int recentLinkInsertCount = 0;
    static int totalPageCount = 0;
    static int recentGoodPageCount = 0;
    static Date taskStart = new Date();
    static Yahoo se = new Yahoo();
    private Connection cnn;
    long timeEplased;
    /** Content of the most recently downloaded page; null while a download is in progress or failed. */
    private String page = "";
    /** Raw hrefs collected for the term currently being processed. */
    Set<String> links;
    /** Decoded form of {@link #links}; this is what gets inserted. */
    Set<String> links2;
    /** URLs of the follow-up result pages of the current term. */
    List<String> nextPagesLinks;
    // to assign tasks by this number; when used as a single thread, set it to 1
    private int threadID;
    SearchResultData searchResultData;
    /** Becomes true once any term has produced at least one link. */
    private boolean getLinks;
    /** Number of pages this instance has downloaded successfully. */
    private int actualPageCount = 0;

    /**
     * Processes the single hard-coded term {@link #term1}.
     *
     * @param args unused
     * @throws SQLException if the connection cannot be obtained or a
     *             statement fails
     */
    public static void main(String[] args) throws SQLException {
        // new SearchResult().run();
        try {
            new SearchResult1().parseATerm(termID1, term1);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Creates a single-threaded instance: sets the shared
     * {@link #interleaving} to 1 and the thread id to 0.
     *
     * @throws SQLException if the database connection cannot be set up
     */
    public SearchResult1() throws SQLException {
        // interleaving is static; assign it without going through 'this'.
        interleaving = 1;
        this.threadID = 0;
        constructor();
    }

    /**
     * Creates an instance for the given worker id. {@link #interleaving} is
     * left untouched and must be set (non-zero) by the caller.
     *
     * @param threadID id passed to the stored procedure to select this
     *            worker's share of terms
     * @throws SQLException if the database connection cannot be set up
     */
    public SearchResult1(int threadID) throws SQLException {
        this.threadID = threadID;
        constructor();
    }

    /** Shared constructor body: opens the connection and creates the collections. */
    private void constructor() throws SQLException {
        cnn = DataAccess.getNewConnection();
        links = new HashSet<String>();
        links2 = new HashSet<String>();
        nextPagesLinks = new ArrayList<String>();
        searchResultData = new SearchResultData(cnn);
    }

    /**
     * Fetches one batch of terms from {@code getTermsInterleaving} and
     * processes them until the batch is exhausted or a page/term limit is hit.
     * Each processed term is committed individually.
     *
     * @return false when the stored procedure returned no rows; true
     *         otherwise (including after a logged and rolled-back
     *         SQLException)
     * @throws IOException if a page download fails permanently
     */
    public boolean parseAllPages() throws IOException {
        CallableStatement ps = null;
        ResultSet rs = null;
        try {
            ps = cnn.prepareCall("{call getTermsInterleaving(?,?,?)}");
            ps.setInt(1, (int) (MAXCOUNT / interleaving));
            ps.setInt(2, interleaving);
            ps.setInt(3, threadID);
            rs = ps.executeQuery();
            // Forward-only iteration: the previous next()/getRow()/beforeFirst()
            // sequence needs a scrollable ResultSet, which prepareCall() does
            // not request.
            boolean hasRow = rs.next();
            if (!hasRow) {
                return false;
            }
            while (hasRow && recentGoodPageCount < MAXCOUNT
                    && actualPageCount < MAXSTPAGECOUNT) {
                parseATerm(rs.getInt("id"), rs.getString("term"));
                cnn.commit();
                // threadOpt.reportTask(true);
                hasRow = rs.next();
            }
        } catch (SQLException e) {
            try {
                cnn.rollback();
            } catch (SQLException e1) {
                e1.printStackTrace();
            }
            e.printStackTrace();
            // NOTE(review): execution falls through to 'return true', so the
            // caller's loop will try again after an SQL failure - confirm
            // that retrying is intended.
        } finally {
            // Release JDBC resources on every path (early return, IOException).
            closeQuietly(rs);
            closeQuietly(ps);
        }
        return true;
    }

    /**
     * Downloads one follow-up result page and collects its links.
     *
     * @param pageLink URL of the page
     * @return number of matching hrefs found on the page, 0 if the page is
     *         empty
     * @throws IOException if the download fails permanently
     */
    private int parseAPage(String pageLink) throws IOException {
        // page = getARawPage(pageLink);
        getAPage(pageLink, 20000);
        // Util.saveAFile("c:\\temp3" + id + ".html", page);
        // Compare content, not references ('page == ""' was never true for a
        // downloaded page).
        if (page == null || page.length() == 0) {
            return 0;
        }
        return parseLinks();
    }

    /**
     * Processes one search term: downloads its result pages, collects and
     * decodes the links, inserts them and tags the term with status 1.
     *
     * @param id   primary key of the term in {@code search_terms}
     * @param term the term appended to the search engine's query string
     * @throws SQLException if inserting the links or updating the status fails
     * @throws IOException  if a page download fails permanently
     */
    private void parseATerm(int id, String term) throws SQLException,
            IOException {
        links.clear();
        links2.clear();
        getAPage(se.quesryString() + term, 20000);
        // System.out.println(page);
        if (page == null || page.length() == 0) {
            return;
        }
        parseLinks();
        if (!getLinks) {
            getLinks = !links.isEmpty();
        }
        nextPagesLinks.clear();
        parsenextPages(term, page);
        for (String nextPage : nextPagesLinks) {
            // Stop following pages as soon as one yields fewer than 100 links.
            if (parseAPage(nextPage) < 100) {
                break;
            }
        }
        decodeUrl();
        int insertCount = searchResultData.insert(links2, 1);
        setStatus(id, 1);
        showTask(insertCount, term);
    }

    /**
     * Copies every collected link into {@link #links2} after un-escaping
     * HTML ampersands and URL-decoding it as UTF-8. Links that cannot be
     * decoded are logged and skipped.
     */
    private void decodeUrl() {
        for (String raw : links) {
            try {
                // The source had replace("&", "&"), a no-op. NOTE(review):
                // un-escaping "&amp;" looks like the intent (the entity was
                // probably lost in rendering) - confirm.
                String unescaped = raw.replace("&amp;", "&");
                links2.add(java.net.URLDecoder.decode(unescaped, "utf-8"));
            } catch (UnsupportedEncodingException e) {
                e.printStackTrace();
            } catch (IllegalArgumentException e) {
                // Malformed %-escape in a scraped URL: skip this link instead
                // of letting the unchecked exception abort the whole run.
                System.out.println("Skipping undecodable link: " + raw);
                e.printStackTrace();
            }
        }
    }

    /**
     * Reads the "Pages (n)" marker from the given page and queues the URLs of
     * the follow-up pages in {@link #nextPagesLinks}, stepping the {@code b}
     * query parameter by 100 (101, 201, ...). Does nothing if the marker is
     * absent.
     *
     * @param term the search term the URLs are built for
     * @param page content of the first result page
     */
    private void parsenextPages(String term, String page) {
        // Pages (813)
        Matcher m = TOTAL_PAGES_PATTERN.matcher(page);
        if (!m.find()) {
            return;
        }
        int totalPage = Util.getNumberFromString(m.group().replace(",", ""));
        int count = 1;
        // NOTE(review): the last queued offset can lie beyond totalPage;
        // kept as in the original - confirm whether that is intended.
        while (count < totalPage) {
            count += 100;
            nextPagesLinks.add(se.quesryString() + term + "&b=" + count
                    + "&bwm=p&bwmo=&bwmf=");
        }
    }

    /**
     * Updates the shared statistics counters for the term just processed and
     * prints a one-line summary.
     *
     * @param insertCount number of links reported as inserted
     * @param term        the term that was processed
     */
    private void showTask(int insertCount, String term) {
        int found = links.size();
        totalLinkCount += found;
        totalLinkInsertCount += insertCount;
        recentLinkInsertCount += insertCount;
        if (found > 0) {
            recentGoodPageCount += 1;
            totalPageCount += 1;
        }
        System.out.println("Links found by this term:" + term + ":"
                + found + "/Links inserted:" + insertCount);
    }

    /**
     * Sets {@code search_terms.tag} for one term.
     *
     * @param id primary key of the term
     * @param i  new tag value
     * @throws SQLException if the update fails
     */
    private void setStatus(int id, int i) throws SQLException {
        PreparedStatement ps = null;
        try {
            ps = cnn.prepareStatement("update search_terms set tag=? where id=?");
            ps.setInt(1, i);
            ps.setInt(2, id);
            ps.execute();
        } finally {
            // The statement used to be leaked on every call.
            closeQuietly(ps);
        }
    }

    /**
     * Lexes the current {@link #page} and adds every {@code href} attribute
     * value containing {@link #pattern} to {@link #links}.
     *
     * @return number of matching hrefs encountered (duplicates included)
     */
    public int parseLinks() {
        int count = 0;
        org.htmlparser.Parser p = new Parser();
        try {
            p.setInputHTML(page);
        } catch (ParserException e) {
            e.printStackTrace();
        }
        Lexer l = p.getLexer();
        try {
            org.htmlparser.nodes.AbstractNode n;
            while ((n = (AbstractNode) l.nextNode()) != null) {
                if (!(n instanceof org.htmlparser.nodes.TagNode)) {
                    continue;
                }
                String link = ((org.htmlparser.nodes.TagNode) n)
                        .getAttribute("href");
                if (link != null && link.contains(pattern)) {
                    links.add(link);
                    count++;
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return count;
    }

    /**
     * Worker entry point: repeatedly calls {@link #parseAllPages()} until it
     * reports no more terms, a limit is reached, or a download fails
     * permanently; then closes the connection.
     */
    public void run() {
        getLinks = false;
        int count = 0;
        try {
            try {
                cnn.setAutoCommit(false);
            } catch (SQLException e1) {
                System.out.println(e1.getMessage());
            }
            boolean taskResult = true;
            try {
                while (taskResult && (recentGoodPageCount < MAXCOUNT)
                        && actualPageCount < MAXSTPAGECOUNT) {
                    taskResult = parseAllPages();
                    System.out.println("Thread "
                            + Thread.currentThread().getName()
                            + " Looping count:" + count++);
                }
            } catch (IOException e) {
                System.out.println(e.getMessage());
            }
        } finally {
            // Close the connection even if an unchecked exception escapes.
            try {
                cnn.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
        System.out.println("--------------------Thread "
                + Thread.currentThread().getName()
                + " Finished-----------------");
    }

    /**
     * Downloads a page into {@link #page}, retrying on failure. An
     * IOException whose message contains "999" is rethrown immediately;
     * any other IOException is retried and rethrown on the fourth failure.
     *
     * @param pageLink URL to download
     * @param timeout  currently ignored, see the note in the body
     * @throws IOException on a "999" error or after the retries are used up
     */
    private void getAPage(String pageLink, int timeout) throws IOException {
        boolean result = false;
        // NOTE(review): this overrides whatever the caller passed (callers
        // pass 20000). Kept to preserve the effective 30 s timeout - confirm
        // which value is intended.
        timeout = 30000;
        int tryCount = 0;
        do {
            try {
                Moderator.ensureInterval();
                getAPageSub(pageLink, timeout);
                result = true;
            } catch (IOException e) {
                // getMessage() may be null; treat that as "not a 999 error".
                String message = e.getMessage();
                boolean is999 = message != null && message.contains("999");
                if (is999 || ++tryCount > 3) {
                    throw e;
                }
                System.out.println(message);
                System.out.println("Retrying");
            }
        } while (!result && tryCount <= 3);
        actualPageCount++;
    }

    /**
     * Performs a single HTTP GET and stores the body in {@link #page}.
     * The body is accepted only if it contains an opening and closing html
     * tag (all lower-case or all upper-case) and is not a "403 Forbidden"
     * page in the same letter case.
     *
     * @param pageLink URL to download
     * @param timeout  connect and read timeout in milliseconds
     * @throws IOException if the request fails, or with the page text as
     *             message if the body is rejected
     */
    private void getAPageSub(String pageLink, int timeout) throws IOException {
        page = null;
        URL url = new URL(pageLink);
        URLConnection urlCon = (HttpURLConnection) url.openConnection();
        urlCon.setRequestProperty("User-agent", "IE/6.0");
        urlCon.setReadTimeout(timeout);
        urlCon.setConnectTimeout(timeout);

        final int cBufSize = 1000;
        char[] cbuf = new char[cBufSize];
        StringBuilder sb = new StringBuilder();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(
                    urlCon.getInputStream()));
            int read;
            while ((read = br.read(cbuf, 0, cBufSize)) > 0) {
                sb.append(cbuf, 0, read);
            }
        } finally {
            // The stream used to be left open after every download.
            closeQuietly(br);
        }
        page = sb.toString();

        boolean lowerCaseOk = page.contains("<html")
                && page.contains("</html>")
                && !page.contains("<title>403 Forbidden</title>");
        boolean upperCaseOk = page.contains("<HTML")
                && page.contains("</HTML>")
                && !page.contains("<TITLE>403 Forbidden</TITLE>");
        if (!(lowerCaseOk || upperCaseOk)) {
            throw new IOException(page);
        }
    }

    /** Closes a result set if non-null; a failure to close is only logged. */
    private static void closeQuietly(ResultSet rs) {
        if (rs == null) {
            return;
        }
        try {
            rs.close();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /** Closes a statement if non-null; a failure to close is only logged. */
    private static void closeQuietly(PreparedStatement st) {
        if (st == null) {
            return;
        }
        try {
            st.close();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /** Closes a reader if non-null; a failure to close is only logged. */
    private static void closeQuietly(BufferedReader reader) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
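// Code-viewer page residue (keyboard shortcut help), translated:
//   Copy code:          Ctrl + C
//   Search code:        Ctrl + F
//   Full-screen mode:   F11
//   Toggle theme:       Ctrl + Shift + D
//   Show shortcuts:     ?
//   Increase font size: Ctrl + =
//   Decrease font size: Ctrl + -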