⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 profiledl.java

📁 利用多线程从搜索引擎下载网页并提取数据到数据库。
💻 JAVA
字号:
import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.sql.CallableStatement;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.Date;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;

/**
 * Worker that downloads a batch of pages, trims each page down to its body
 * section, hands it to {@code ParseProfile} for storage, and records the
 * per-URL download status through {@code SearchResultData}.
 *
 * <p>
 * Each instance is meant to be run by one thread ({@link #run()}); several
 * instances share the static counters and flags declared below. Another
 * thread may call {@link #exit(String)} on a running instance to abort it.
 */
public class ProfileDL implements Runnable {

	/**
	 * Work list for this worker: row id mapped to the URL to download. Stays
	 * {@code null} when the no-argument constructor is used.
	 */
	private Map<Integer, String> urls;

	/** Global stop switch checked before every download; volatile so all workers see it. */
	public static volatile boolean stop = false;

	/** Per-instance stop switch set by {@link #exit(String)} from another thread. */
	private volatile boolean exitFlag = false;

	/**
	 * Incremented whenever a worker gives up on a download or hits an
	 * unexpected error, so that other threads can notice (e.g. that the
	 * search engine is blocking this IP).
	 */
	public static volatile int alert;

	/**
	 * Pages per batch (default). Original author's note: downloads exceeding
	 * this number will be blocked by google.
	 */
	public static int maxCount = 500;

	// NOTE(review): milestoneLink, lowID and highID are not read anywhere in
	// this class; they are kept in case code outside this view relies on them.
	private final static String milestoneLink = "http://www.linkedin.com/pub/0/1";

	private static int lowID;

	private static int highID;

	/** Connection owned by this worker; opened in {@link #constructor()}, closed in {@link #closeCnn()}. */
	private volatile Connection cnn;

	/** Shared connection obtained from {@code DataAccess.getConnection()}. */
	public static Connection cnn2;

	/** Shared progress counters, updated only inside {@link #showTask()}. */
	public static volatile int totalPageCount = 0;

	public static volatile int totalLinkCount = 0;

	public static volatile int totalofTotal = 0;

	/** Reference point for the elapsed time printed by {@link #showTask()}. */
	public static Date taskStart = new Date();

	// Used to assign tasks by this number; set it to 1 when used as a single
	// thread. (Stored but not read inside this class.)
	private int interleaving;

	private int threadID;

	private SearchResultData searchResultData;

	private Profile profile;

	static private long timeElapsed;

	/** Text of the most recently downloaded page, trimmed by {@link #trimPage()}. */
	private String page;

	public final static int startingID = 591730;

	// State of the download currently in flight. The connection and readers
	// are volatile because exit() closes them from a different thread.
	URL url = null;

	volatile HttpURLConnection httpUrlCon = null;

	volatile BufferedReader br = null;

	volatile InputStreamReader isr;

	StringBuilder sb = null;

	/**
	 * Runs a single worker on the calling thread. No URL map is supplied
	 * here, so the worker opens its connections and finishes immediately.
	 *
	 * @param args
	 *            unused
	 * @throws SQLException
	 *             declared by the constructor
	 */
	public static void main(String[] args) throws SQLException {
		new ProfileDL().run();
	}

	/**
	 * Creates a single-threaded worker (interleaving 1, thread id 0) without
	 * a URL map. Database setup is delayed to {@link #run()} to accelerate
	 * thread starting.
	 *
	 * @throws SQLException
	 *             never thrown by the visible code; kept for callers
	 */
	public ProfileDL() throws SQLException {
		this.interleaving = 1;
		this.threadID = 0;
	}

	/**
	 * Creates a worker for one slice of the work list. Database setup is
	 * delayed to {@link #run()} to accelerate thread starting.
	 *
	 * @param threadID
	 *            identifier of this worker
	 * @param interleaving
	 *            task-assignment stride (1 for a single thread)
	 * @param urls
	 *            row id (Integer) mapped to URL (String) to download
	 * @throws SQLException
	 *             never thrown by the visible code; kept for callers
	 */
	@SuppressWarnings("unchecked")
	public ProfileDL(int threadID, int interleaving, Map urls)
			throws SQLException {
		this.interleaving = interleaving;
		this.threadID = threadID;
		// The parameter stays raw for backward compatibility; entries are
		// read as Integer -> String, exactly as the original cast assumed.
		this.urls = urls;
	}

	/**
	 * Opens the database resources this worker needs.
	 *
	 * @throws SQLException
	 *             if a connection cannot be obtained
	 */
	private synchronized void constructor() throws SQLException {
		cnn = DataAccess.getNewConnection();
		searchResultData = new SearchResultData(cnn);
		cnn2 = DataAccess.getConnection();
	}

	/**
	 * Downloads and stores every URL of the work list, committing after each
	 * page. Stops early when {@link #exitFlag} or {@link #stop} is set, when
	 * {@link #maxCount} links have been processed, or on the first error.
	 */
	private synchronized void parseAllPages() {
		if (urls == null || urls.isEmpty()) {
			// Nothing to do. Previously a null map caused an NPE that was
			// reported through 'alert' as if a download had failed.
			return;
		}
		int id = 0;
		try {
			cnn.setAutoCommit(false);
			for (Map.Entry<Integer, String> entry : urls.entrySet()) {
				if (exitFlag || stop || totalLinkCount >= maxCount) {
					break;
				}
				id = entry.getKey();
				saveAPage(entry.getValue(), id);
				// Marked as done even when saveAPage skipped an untrimmable page.
				searchResultData.updateCacheUrlDownloadTagOK(3, id);
				cnn.commit();
			}
		} catch (SQLException e) {
			e.printStackTrace();
			try {
				cnn.rollback();
			} catch (SQLException e1) {
				e1.printStackTrace();
			}
		} catch (IOException e) {
			e.printStackTrace();
			if (exitFlag) {
				// exit() aborted the download on purpose: this is neither a
				// failed URL nor a sign of blocking. run() rolls back.
				return;
			}
			searchResultData.updateCacheUrlDownloadTagFailed(1, id);
			try {
				cnn.commit();
			} catch (SQLException e1) {
				e1.printStackTrace();
			}
			// alert other threads and daemon that the search engine is
			// blocking this IP now
			raiseAlert();
		} catch (Exception e) { // catch any other exceptions.
			e.printStackTrace();
			raiseAlert();
		}
	}

	/** Atomically increments the shared {@link #alert} counter. */
	private static synchronized void raiseAlert() {
		alert++;
	}

	/**
	 * Downloads one page, trims it, parses it into the database and updates
	 * the progress counters. Returns silently when the page cannot be trimmed.
	 *
	 * @param pageLink
	 *            URL to download
	 * @param urlID
	 *            row id the page belongs to
	 * @throws IOException
	 *             if the download finally fails
	 * @throws SQLException
	 *             if storing the profile fails
	 */
	private synchronized void saveAPage(String pageLink, int urlID)
			throws IOException, SQLException {
		if (!getAPage(pageLink)) {
			return;
		}
		if (!trimPage()) {
			return;
		}
		profile = new Profile(cnn, urlID, page);
		ParseProfile pp = new ParseProfile(profile);
		pp.parseAndSaveToDB();
		showTask();
	}

	/**
	 * Cuts {@link #page} down to the span from {@code <body} up to (not
	 * including) the control info bar.
	 *
	 * @return {@code false} if either marker is missing or they are out of
	 *         order; {@link #page} is left untouched in that case
	 */
	private boolean trimPage() {
		int start = page.indexOf("<body");
		if (start < 0) {
			return false;
		}
		int end = page.indexOf("<div id=\"control\" class=\"infobar\">");
		if (end <= start) {
			return false;
		}
		page = page.substring(start, end);
		return true;
	}

	/**
	 * Bumps the shared counters and prints a progress line every tenth link.
	 * The class lock makes the read-modify-write updates atomic across
	 * workers.
	 */
	private void showTask() {
		synchronized (ProfileDL.class) {
			totalPageCount += 1;
			totalLinkCount += 1;
			totalofTotal += 1;
			timeElapsed = new Date().getTime() - taskStart.getTime();
			if (totalLinkCount % 10 == 0) {
				System.out.print("Total Link Count:" + totalLinkCount);
				System.out.print("  /  Total Page Count:" + totalPageCount);
				System.out.println("  /  Total Time elapsed:" + timeElapsed
						/ 1000 + " seconds");
				System.out.println();
			}
		}
	}

	/**
	 * Opens the database connections, processes the work list and always
	 * releases this worker's connection afterwards. If the connections cannot
	 * be opened the worker returns without printing the "Finished" banner.
	 */
	@Override
	public synchronized void run() {
		try {
			constructor();
			parseAllPages();
		} catch (SQLException e) {
			e.printStackTrace();
			return;
		} finally {
			// Also covers a partially successful constructor().
			closeCnn();
		}

		System.out.println("-------------------------Thread:"
				+ Thread.currentThread().getName()
				+ " Finished-------------------------");
	}

	/**
	 * Downloads a page, retrying up to three times after an I/O error.
	 *
	 * @param pageLink
	 *            URL to download
	 * @return the result of {@link #getAPageSub(String)}
	 * @throws IOException
	 *             when the error message contains "999", when the retries
	 *             are used up, or when {@link #exit(String)} was requested
	 */
	private boolean getAPage(String pageLink) throws IOException {
		int tryCount = 0;
		while (true) {
			try {
				// presumably throttles the request rate - confirm in Moderator
				Moderator.ensureInterval();
				return getAPageSub(pageLink);
			} catch (IOException e) {
				// getMessage() may be null; guard it so the real error is
				// not replaced by a NullPointerException.
				String message = e.getMessage();
				boolean blocked = message != null && message.contains("999");
				if (exitFlag || blocked || ++tryCount > 3) {
					throw e;
				}
				System.out.println(message);
				System.out.println("Retrying");
			}
		}
	}

	/**
	 * Performs one HTTP GET and stores the response text in {@link #page}.
	 * The reader is always closed, whether or not the read succeeds.
	 *
	 * @param pageLink
	 *            URL to download
	 * @return {@code true} when the response looks like a complete HTML
	 *         document that is not a "403 Forbidden" page
	 * @throws IOException
	 *             on network failure, or carrying the page text as message
	 *             when the response fails the check above
	 */
	private synchronized boolean getAPageSub(String pageLink)
			throws IOException {

		final int cBufSize = 1000;
		char[] cbuf = new char[cBufSize];

		// Reset the shared state that exit() may look at.
		httpUrlCon = null;
		br = null;
		isr = null;
		sb = new StringBuilder();
		url = new URL(pageLink);

		HttpURLConnection connection = (HttpURLConnection) url
				.openConnection();
		httpUrlCon = connection;
		connection.setRequestProperty("User-agent", "IE/6.0");

		InputStreamReader in = null;
		BufferedReader reader = null;
		try {
			// NOTE(review): decodes with the platform default charset;
			// confirm whether the response charset should be honoured.
			in = new InputStreamReader(connection.getInputStream());
			isr = in;
			reader = new BufferedReader(in);
			br = reader;
			int count;
			while ((count = reader.read(cbuf, 0, cBufSize)) > 0) {
				sb.append(cbuf, 0, count);
			}
		} finally {
			// Closing the outermost reader also closes the wrapped stream.
			try {
				if (reader != null) {
					reader.close();
				} else if (in != null) {
					in.close();
				}
			} catch (IOException ignored) {
				// best-effort cleanup; the primary outcome matters more
			}
		}
		page = sb.toString();

		boolean lowerCaseOk = page.contains("<html")
				&& page.contains("</html>")
				&& !page.contains("<title>403 Forbidden</title>");
		boolean upperCaseOk = page.contains("<HTML")
				&& page.contains("</HTML>")
				&& !page.contains("<TITLE>403 Forbidden</TITLE>");
		if (lowerCaseOk || upperCaseOk) {
			return true;
		}
		throw new IOException(page);
	}

	/**
	 * Last-resort cleanup of the database connection. Kept for backward
	 * compatibility; {@link #run()} already closes the connection itself.
	 */
	public void finalize() {

		closeCnn();
	}

	/**
	 * Rolls back any open transaction and closes this worker's connection.
	 * A failing rollback no longer prevents the connection from being closed.
	 */
	private void closeCnn() {
		Connection connection = cnn;
		if (connection == null) {
			return;
		}
		try {
			if (connection.isClosed()) {
				return;
			}
			try {
				connection.rollback();
			} catch (SQLException e) {
				e.printStackTrace();
			}
			connection.close();
		} catch (SQLException e1) {
			e1.printStackTrace();
		}
	}

	/**
	 * Asks this worker to stop and forcibly closes its network and database
	 * resources so that a blocked read returns.
	 *
	 * <p>
	 * Deliberately NOT synchronized: {@link #run()} holds this object's
	 * monitor for the whole lifetime of the worker, so a synchronized exit
	 * could only proceed after the worker had already finished.
	 *
	 * @param threadName
	 *            name of the thread being stopped, used for logging only
	 */
	public void exit(String threadName) {
		System.out.println("trying to close IOs and exit thread:" + threadName);

		exitFlag = true;

		HttpURLConnection connection = httpUrlCon;
		try {
			if (connection != null) {
				connection.disconnect();
			}
		} catch (Exception e) {
			e.printStackTrace();
		}

		InputStreamReader in = isr;
		try {
			if (in != null) {
				in.close();
			}
		} catch (Exception e) {// catch all exceptions.
			e.printStackTrace();
		}

		BufferedReader reader = br;
		try {
			if (reader != null) {
				reader.close();
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
		closeCnn();

	}

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -