// profiledl.java
// (Listing captured from a web code viewer; the viewer's "Font size:" control label appeared here.)
import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.sql.CallableStatement;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.Date;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
/**
 * Worker that downloads a set of profile pages over HTTP, trims each page to
 * its body section and hands it to {@code ParseProfile} for storage.
 *
 * <p>Each instance receives a map of {@code id -> URL} and processes it in
 * {@link #run()}. Progress counters and the stop/alert flags are static and
 * therefore shared by every instance; they are updated under the class lock.
 * {@link #exit(String)} may be called from another thread to abort a running
 * worker.
 */
public class ProfileDL implements Runnable {
    /** Pages to download: key is the search-result id, value is the URL. May be null. */
    private Map<Integer, String> urls;

    /** Global stop request checked before every page; volatile so all workers see it. */
    public static volatile boolean stop = false;

    /** Per-instance stop request set by {@link #exit(String)}. */
    private volatile boolean exitFlag = false;

    /** Incremented whenever a download batch is aborted by an I/O or unexpected error. */
    public static int alert;

    /**
     * Pages per batch (default batch size). Per the original author's note,
     * downloads beyond this number get blocked by Google.
     */
    public static int maxCount = 500;

    // Not referenced anywhere in this class; kept from the original source.
    private final static String milestoneLink = "http://www.linkedin.com/pub/0/1";
    private static int lowID;
    private static int highID;

    /** Connection owned by this worker; opened in run() and closed by closeCnn(). */
    private volatile Connection cnn;

    /** Shared connection obtained from DataAccess.getConnection(); never closed here. */
    public static Connection cnn2;

    public static int totalPageCount = 0;
    public static int totalLinkCount = 0;
    public static int totalofTotal = 0;
    public static Date taskStart = new Date();

    /** Used to assign tasks by this number; when used as a single thread, set it to 1. */
    private int interleaving;
    private int threadID;

    private SearchResultData searchResultData;
    private Profile profile;

    /** Milliseconds between {@link #taskStart} and the most recently saved page. */
    static private long timeElapsed;

    /** Text of the most recently downloaded page (trimmed in place by trimPage()). */
    private String page;

    public final static int startingID = 591730;

    // State of the download in progress. The connection and readers are kept
    // in fields (and volatile) so that exit() can close them from another thread.
    URL url = null;
    volatile HttpURLConnection httpUrlCon = null;
    volatile BufferedReader br = null;
    volatile InputStreamReader isr;
    StringBuilder sb = null;

    /**
     * Runs a single worker on the calling thread.
     *
     * @param args ignored
     * @throws SQLException declared by the constructor
     */
    public static void main(String[] args) throws SQLException {
        new ProfileDL().run();
    }

    /**
     * Creates a single-threaded worker (interleaving 1, thread id 0) with no
     * URL map. Database connections are opened lazily in {@link #run()} so
     * that thread start-up stays fast.
     *
     * @throws SQLException kept in the signature for existing callers
     */
    public ProfileDL() throws SQLException {
        this.interleaving = 1;
        this.threadID = 0;
    }

    /**
     * Creates a worker for one slice of the work. Database connections are
     * opened lazily in {@link #run()} so that thread start-up stays fast.
     *
     * @param threadID     id of this worker
     * @param interleaving number used to assign tasks; 1 for single-threaded use
     * @param urls         map whose keys are {@code Integer} ids and whose
     *                     values are {@code String} URLs
     * @throws SQLException kept in the signature for existing callers
     */
    @SuppressWarnings({ "unchecked", "rawtypes" })
    public ProfileDL(int threadID, int interleaving, Map urls)
            throws SQLException {
        this.interleaving = interleaving;
        this.threadID = threadID;
        // The parameter stays raw for source compatibility with existing callers.
        this.urls = urls;
    }

    /** Opens this worker's own connection and refreshes the shared one. */
    private synchronized void initConnections() throws SQLException {
        cnn = DataAccess.getNewConnection();
        searchResultData = new SearchResultData(cnn);
        cnn2 = DataAccess.getConnection();
    }

    /**
     * Downloads and stores every page in {@link #urls} until the map is
     * exhausted, a stop is requested, or {@link #maxCount} links were handled.
     * Each page is committed separately. The first failure ends the batch:
     * an SQL error rolls back, an I/O error marks the current id as failed
     * and raises {@link #alert}, any other exception raises {@link #alert}.
     */
    private synchronized void parseAllPages() {
        if (urls == null) {
            // Without this guard the loop below threw a NullPointerException
            // that was counted as an alert, i.e. reported as a blocked IP.
            System.out.println("No URLs assigned to thread:"
                    + Thread.currentThread().getName());
            return;
        }
        int id = 0;
        try {
            cnn.setAutoCommit(false);
            Iterator<Map.Entry<Integer, String>> it = urls.entrySet().iterator();
            while (it.hasNext() && !exitFlag && !stop
                    && totalLinkCount < maxCount) {
                Map.Entry<Integer, String> entry = it.next();
                id = entry.getKey();
                saveAPage(entry.getValue(), id);
                // NOTE(review): this also runs when saveAPage() returned early
                // because the page could not be trimmed - confirm that tag 3
                // is meant to cover that case.
                searchResultData.updateCacheUrlDownloadTagOK(3, id);
                cnn.commit();
            }
        } catch (SQLException e) {
            e.printStackTrace();
            try {
                cnn.rollback();
            } catch (SQLException e1) {
                e1.printStackTrace();
            }
        } catch (IOException e) {
            e.printStackTrace();
            searchResultData.updateCacheUrlDownloadTagFailed(1, id);
            try {
                cnn.commit();
            } catch (SQLException e1) {
                e1.printStackTrace();
            }
            // Alert other threads and the daemon that the search engine is
            // blocking this IP now.
            raiseAlert();
        } catch (Exception e) { // catch any other exceptions.
            e.printStackTrace();
            raiseAlert();
        }
    }

    /**
     * Downloads one page, trims it and stores the parsed profile. Returns
     * without storing anything when the page cannot be trimmed.
     *
     * @param pageLink URL of the page
     * @param urlID    id the page belongs to
     * @throws IOException  if the download fails (see getAPage)
     * @throws SQLException propagated from parsing/saving the profile
     */
    private synchronized void saveAPage(String pageLink, int urlID)
            throws IOException, SQLException {
        if (!getAPage(pageLink) || !trimPage()) {
            return;
        }
        profile = new Profile(cnn, urlID, page);
        new ParseProfile(profile).parseAndSaveToDB();
        showTask();
    }

    /**
     * Cuts {@link #page} down to the text from the first {@code <body} up to
     * (not including) the first {@code <div id="control" class="infobar">}.
     *
     * @return false, leaving the page untouched, when either marker is
     *         missing or the end marker does not come after the start marker
     */
    private boolean trimPage() {
        int start = page.indexOf("<body");
        if (start < 0) {
            return false;
        }
        int end = page.indexOf("<div id=\"control\" class=\"infobar\">");
        if (end <= start) {
            return false;
        }
        page = page.substring(start, end);
        return true;
    }

    /**
     * Counts one saved page and prints a progress line every tenth link.
     * Runs under the class lock because the counters are static while the
     * calling methods only hold their own instance's lock.
     */
    private void showTask() {
        synchronized (ProfileDL.class) {
            totalPageCount += 1;
            totalLinkCount += 1;
            totalofTotal += 1;
            timeElapsed = System.currentTimeMillis() - taskStart.getTime();
            if (totalLinkCount % 10 == 0) {
                System.out.print("Total Link Count:" + totalLinkCount);
                System.out.print(" / Total Page Count:" + totalPageCount);
                System.out.println(" / Total Time elapsed:" + timeElapsed / 1000
                        + " seconds");
                System.out.println();
            }
        }
    }

    /** Increments {@link #alert} under the class lock so concurrent workers do not lose updates. */
    private static synchronized void raiseAlert() {
        alert++;
    }

    /**
     * Opens the database connections, processes all pages and closes this
     * worker's connection again. The connection is closed even when
     * initialisation fails part-way or processing throws.
     */
    @Override
    public synchronized void run() {
        try {
            initConnections();
            parseAllPages();
        } catch (SQLException e) {
            // Only initConnections() can get here; parseAllPages() handles its own.
            e.printStackTrace();
            return;
        } finally {
            closeCnn();
        }
        System.out.println("-------------------------Thread:"
                + Thread.currentThread().getName()
                + " Finished-------------------------");
    }

    /**
     * Downloads a page, retrying up to three times after an I/O error.
     *
     * @param pageLink URL of the page
     * @return the result of the successful download attempt
     * @throws IOException immediately when the error message contains "999",
     *                     otherwise once the fourth attempt has failed
     */
    private boolean getAPage(String pageLink) throws IOException {
        int tryCount = 0;
        while (true) {
            try {
                Moderator.ensureInterval();
                return getAPageSub(pageLink);
            } catch (IOException e) {
                // getMessage() may be null; treat that as "not a 999 error".
                // NOTE(review): this matches "999" anywhere in the message,
                // including a rejected page body or the URL - confirm intent.
                String message = e.getMessage();
                boolean blocked = message != null && message.contains("999");
                if (blocked || ++tryCount > 3) {
                    throw e;
                }
                System.out.println(message);
                System.out.println("Retrying");
            }
        }
    }

    /**
     * Performs one HTTP GET and stores the response text in {@link #page}.
     * The reader is always closed before returning.
     *
     * @param pageLink URL of the page
     * @return true when the response looks like a complete HTML document
     *         that is not a "403 Forbidden" page
     * @throws IOException on any transfer error, or - carrying the whole
     *                     response as its message - when the response is
     *                     not accepted
     */
    private synchronized boolean getAPageSub(String pageLink)
            throws IOException {
        url = null;
        httpUrlCon = null;
        br = null;
        isr = null;
        sb = new StringBuilder();
        final int cBufSize = 1000;
        char[] cbuf = new char[cBufSize];
        try {
            url = new URL(pageLink);
            httpUrlCon = (HttpURLConnection) url.openConnection();
            httpUrlCon.setRequestProperty("User-agent", "IE/6.0");
            // Decodes with the platform default charset, as before.
            isr = new InputStreamReader(httpUrlCon.getInputStream());
            br = new BufferedReader(isr);
            int count;
            while ((count = br.read(cbuf, 0, cBufSize)) > 0) {
                sb.append(cbuf, 0, count);
            }
        } finally {
            // Closing the outermost reader releases the underlying HTTP stream.
            if (br != null) {
                closeQuietly(br);
            } else {
                closeQuietly(isr);
            }
        }
        page = sb.toString();
        boolean lowerCaseOk = page.contains("<html")
                && page.contains("</html>")
                && !page.contains("<title>403 Forbidden</title>");
        boolean upperCaseOk = page.contains("<HTML")
                && page.contains("</HTML>")
                && !page.contains("<TITLE>403 Forbidden</TITLE>");
        if (lowerCaseOk || upperCaseOk) {
            return true;
        }
        throw new IOException(page);
    }

    /** Last-resort cleanup: closes this worker's connection. */
    @Override
    public void finalize() {
        closeCnn();
    }

    /**
     * Rolls back any open transaction and closes this worker's connection.
     * Does nothing when the connection was never opened or is already closed.
     */
    private void closeCnn() {
        Connection connection = cnn;
        if (connection == null) {
            return;
        }
        try {
            if (connection.isClosed()) {
                return;
            }
            try {
                // rollback() is an error in auto-commit mode, so only call it
                // when a transaction can actually be open.
                if (!connection.getAutoCommit()) {
                    connection.rollback();
                }
            } finally {
                // Close even when the rollback failed.
                connection.close();
            }
        } catch (SQLException e1) {
            e1.printStackTrace();
        }
    }

    /** Closes a resource, printing instead of propagating any failure. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (Exception e) { // best effort: never let cleanup fail the caller
            e.printStackTrace();
        }
    }

    /**
     * Asks this worker to stop and tears down its I/O and database resources.
     *
     * <p>Deliberately not synchronized: {@link #run()} holds this object's
     * lock for its whole duration, so a synchronized exit() could only start
     * after the worker had already finished.
     *
     * @param threadName name of the worker thread, used in the log line only
     */
    public void exit(String threadName) {
        System.out.println("trying to close IOs and exit thread:" + threadName);
        exitFlag = true;
        // Disconnect first so a read blocked on the socket is released before
        // the readers are closed.
        HttpURLConnection connection = httpUrlCon;
        try {
            if (connection != null) {
                connection.disconnect();
            }
        } catch (Exception e) { // best effort
            e.printStackTrace();
        }
        closeQuietly(isr);
        closeQuietly(br);
        closeCnn();
    }
}
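/*
 * Keyboard shortcuts (help text of the web code viewer this listing was
 * captured from; not part of the program):
 *   Copy code:           Ctrl + C
 *   Search code:         Ctrl + F
 *   Full-screen mode:    F11
 *   Toggle theme:        Ctrl + Shift + D
 *   Show shortcuts:      ?
 *   Increase font size:  Ctrl + =
 *   Decrease font size:  Ctrl + -
 */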