/*
 * searchresult2.java
 * From: "利用多线程从搜索引擎下载网页并提取数据到数据库。"
 * (Using multiple threads to download pages from a search engine and
 * extract the data into a database.) Java code, 329 lines.
 */
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.htmlparser.Parser;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.nodes.AbstractNode;
import org.htmlparser.util.ParserException;

/**
 * Read stored links from table search_result_prepare, scrape each page for
 * links to persons' profile on LinkIn.Com
 * 
 * @author james
 * 
 */
/**
 * Reads stored search terms from table {@code search_terms}, queries Yahoo
 * for each one, scrapes every result page (including pagination) for cached
 * LinkedIn profile links ({@code /search/cache?ipc=}), URL-decodes them and
 * inserts them into the database via {@link SearchResultData}.
 *
 * Instances are {@link Runnable} so several workers can share the load:
 * terms are partitioned by {@code id % interleaving == threadID}.
 *
 * @author james
 */
public class SearchResult2 implements Runnable {

	/** Maximum number of terms a single run will process. */
	public static int MAXCOUNT = 490;
	/** Number of concurrent workers; terms are split by id modulo this. */
	public static int interleaving;
	public static TaskOptimizer threadOpt;
	public static TaskOptimizer pageOpt;

	/** Seed term used by {@link #main(String[])} for a single-term run. */
	static String term1;
	static {
		term1 = "http://www.linkedin.com/pub/0/4";
	}
	/** Hrefs containing this substring point at cached profile results. */
	final private static String pattern = "/search/cache?ipc=";

	/** Matches the "Pages (813)" marker on a result page; commas allowed. */
	private static final Pattern PAGES = Pattern
			.compile("Pages \\([\\d\\,]{1,5}\\)");

	// Shared progress counters. NOTE(review): these are mutated from every
	// worker thread without synchronization, so totals may drift when
	// several threads run concurrently; they are informational only.
	static int totalLinkCount = 0;
	static int totalLinkInsertCount = 0;
	static int recentLinkInsertCount = 0;
	static int totalPageCount = 0;

	static Date taskStart = new Date();
	static Yahoo se = new Yahoo();

	/** Per-instance DB connection; each worker owns its own. */
	private Connection cnn;

	long timeEplased;
	/** Raw HTML of the page fetched most recently by getAPage(). */
	private String page = "";
	/** Raw hrefs harvested from result pages for the current term. */
	Set<String> links;
	/** URL-decoded copies of {@link #links}, ready for insertion. */
	Set<String> links2;
	/** Pagination URLs ("next page") for the term being processed. */
	Set<String> nextPagesLinks;

	// Partitions the work across threads; 0 when used single-threaded.
	private int threadID;

	SearchResultData searchResultData;

	/**
	 * Test entry point: processes only the seed term {@link #term1}.
	 *
	 * @throws SQLException if the database connection cannot be opened
	 */
	public static void main(String[] args) throws SQLException {
		try {
			new SearchResult2().parseATerm(1, term1);
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/** Single-threaded setup: worker 0 of an interleaving of 1. */
	public SearchResult2() throws SQLException {
		// Fixed: was "this.interleaving = 1" — interleaving is static, so
		// qualifying it with "this" was misleading.
		interleaving = 1;
		this.threadID = 0;
		init();
	}

	/**
	 * Multi-threaded setup; the caller is expected to have set
	 * {@link #interleaving} to the total worker count beforehand.
	 *
	 * @param threadID this worker's slot in [0, interleaving)
	 */
	public SearchResult2(int threadID) throws SQLException {
		this.threadID = threadID;
		init();
	}

	/** Shared constructor body: opens the connection and the link sets. */
	private void init() throws SQLException {
		cnn = DataAccess.getNewConnection();
		links = new HashSet<String>();
		links2 = new HashSet<String>();
		nextPagesLinks = new HashSet<String>();
		searchResultData = new SearchResultData(cnn);
	}

	/**
	 * @return the smallest unprocessed term id (tag = 0), or
	 *         Integer.MAX_VALUE when none remain
	 */
	private int getMinID() throws SQLException {
		PreparedStatement ps = cnn
				.prepareStatement("SELECT min(id)as minid FROM search_terms s where tag=0");
		try {
			ResultSet rs = ps.executeQuery();
			if (rs.next())
				return rs.getInt("minid");
			return Integer.MAX_VALUE;
		} finally {
			ps.close(); // fixed leak: closing the statement closes its ResultSet too
		}
	}

	/**
	 * Main work loop: fetches this worker's slice of unprocessed terms and
	 * parses each one inside its own transaction. SQL failures roll back;
	 * I/O failures abort the remaining terms. The connection is closed on
	 * the way out in every case.
	 */
	public void parseAllPages() {
		PreparedStatement ps = null;
		try {
			cnn.setAutoCommit(false);

			int low = getMinID();
			// Over-fetch a window of ids; other workers share the range.
			int high = low + MAXCOUNT * 5;
			ps = cnn.prepareStatement("SELECT s.id,s.term, s.tag "
					+ "FROM search_terms s "
					+ "where tag=0 and id%?=? and id between ? and ?");
			ps.setInt(1, interleaving);
			ps.setInt(2, threadID);
			ps.setInt(3, low);
			ps.setInt(4, high);

			ResultSet rs = ps.executeQuery();
			while (rs.next() && (totalPageCount < MAXCOUNT)) {
				parseATerm(rs.getInt("id"), rs.getString("term"));
				cnn.commit(); // one transaction per term
			}
		} catch (SQLException e) {
			try {
				cnn.rollback();
			} catch (SQLException e1) {
				e1.printStackTrace();
			}
			e.printStackTrace();
		} catch (IOException e) {
			// Network failure or ban page (getAPage throws the body);
			// give up on the rest of this batch.
			e.printStackTrace();
		} finally {
			if (ps != null) {
				try {
					ps.close(); // fixed: statement was previously leaked
				} catch (SQLException e) {
					e.printStackTrace();
				}
			}
			try {
				cnn.close();
			} catch (SQLException e) {
				e.printStackTrace();
			}
		}
	}

	/**
	 * Fetches one pagination page and harvests profile links from it into
	 * {@link #links}.
	 */
	private void parseAPage(String pageLink) throws IOException {
		getAPage(pageLink, 20000);
		// Fixed: was (page == "" || page == null) — a reference comparison
		// against "" and a null check that came too late to protect it.
		if (page == null || page.length() == 0)
			return;
		parseLinks();
	}

	/**
	 * Processes a single search term: fetches the first result page, walks
	 * every pagination page, decodes the harvested links, inserts them and
	 * marks the term done (tag = 1).
	 *
	 * @param id   row id of the term in search_terms
	 * @param term the search term / URL fragment to query for
	 */
	private void parseATerm(int id, String term) throws SQLException,
			IOException {
		links.clear();
		links2.clear();
		getAPage(se.quesryString() + term, 20000);
		if (page == null || page.length() == 0)
			return;
		parseLinks();

		nextPagesLinks.clear();
		parsenextPages(term, page);
		for (String s : nextPagesLinks)
			parseAPage(s);

		decodeUrl();
		int insertCount = searchResultData.insert(links2, 1);
		setStatus(id, 1);

		showTask(insertCount, term);
	}

	/**
	 * Copies {@link #links} into {@link #links2}, un-escaping HTML
	 * ampersands and URL-decoding each entry.
	 */
	private void decodeUrl() {
		for (String s : links) {
			try {
				s = s.replace("&amp;", "&");
				links2.add(java.net.URLDecoder.decode(s, "utf-8"));
			} catch (UnsupportedEncodingException e) {
				// Cannot happen in practice: UTF-8 is always supported.
				e.printStackTrace();
			}
		}
	}

	/**
	 * Scans a result page for its total page count (rendered like
	 * "Pages (813)") and builds the pagination URLs, 100 results apart,
	 * into {@link #nextPagesLinks}.
	 */
	private void parsenextPages(String term, String page) {
		Matcher m = PAGES.matcher(page);
		if (!m.find())
			return; // single page of results — nothing to paginate

		// Strip thousands separators ("1,234") before extracting the number.
		int totalPage = Util.getNumberFromString(m.group().replace(",", ""));
		int count = 1;
		while (count < totalPage) {
			count += 100; // Yahoo pages by absolute result offset (&b=)
			nextPagesLinks.add(se.quesryString() + term + "&b=" + count
					+ "&bwm=p&bwmo=&bwmf=");
		}
	}

	/** Updates the shared counters and logs progress for one finished term. */
	private void showTask(int insertCount, String term) {
		totalPageCount += 1;
		totalLinkCount += links.size();
		totalLinkInsertCount += insertCount;
		recentLinkInsertCount += insertCount;

		System.out.println("Links found by this term:" + term + ":"
				+ links.size() + "/Links inserted:" + insertCount);
	}

	/**
	 * Tags a term row (1 = processed).
	 *
	 * @param id row id in search_terms
	 * @param i  tag value to set
	 */
	private void setStatus(int id, int i) throws SQLException {
		PreparedStatement ps = cnn
				.prepareStatement("update search_terms set tag=? where id=?");
		try {
			ps.setInt(1, i);
			ps.setInt(2, id);
			ps.execute();
		} finally {
			ps.close(); // fixed: statement was previously leaked
		}
	}

	/**
	 * Lexes {@link #page} and collects every href attribute containing
	 * {@link #pattern} into {@link #links}.
	 */
	public void parseLinks() {
		Parser p = new Parser();
		try {
			p.setInputHTML(page);
		} catch (ParserException e) {
			e.printStackTrace();
			return; // nothing to lex if the page could not be loaded
		}
		Lexer l = p.getLexer();
		try {
			AbstractNode n;
			while ((n = (AbstractNode) l.nextNode()) != null) {
				if (n instanceof org.htmlparser.nodes.TagNode) {
					String link = ((org.htmlparser.nodes.TagNode) n)
							.getAttribute("href");
					if (link != null && link.contains(pattern))
						links.add(link);
				}
			}
		} catch (ParserException e) {
			e.printStackTrace();
		}
	}

	/** Runnable entry point: drains this worker's share of the terms. */
	public void run() {
		parseAllPages();
		System.out.println("--------------------Thread "
				+ Thread.currentThread().getName()
				+ " Finished-----------------");
	}

	/**
	 * Downloads pageLink into {@link #page}, throttled by
	 * {@link Moderator#ensureInterval()}. Throws an IOException carrying
	 * the page body when the response does not look like a complete HTML
	 * document or is a 403 page (used upstream to detect blocking).
	 *
	 * @param pageLink URL to fetch
	 * @param timeout  requested timeout in ms. NOTE(review): the original
	 *                 code overrides this with a fixed 10 minutes; that
	 *                 behavior is preserved here — confirm intent before
	 *                 honoring the parameter.
	 */
	private void getAPage(String pageLink, int timeout) throws IOException {
		Moderator.ensureInterval();

		timeout = 10 * 60 * 1000; // hard-coded 10-minute budget (see note)

		URL url = new URL(pageLink);
		HttpURLConnection urlCon = (HttpURLConnection) url.openConnection();
		urlCon.setRequestProperty("User-agent", "IE/6.0");
		urlCon.setReadTimeout(timeout);
		urlCon.setConnectTimeout(timeout);

		// NOTE(review): platform default charset is used, as before; the
		// remote page's declared encoding is not inspected.
		BufferedReader br = new BufferedReader(new InputStreamReader(urlCon
				.getInputStream()));
		StringBuilder sb = new StringBuilder();
		try {
			final int cBufSize = 1000;
			char[] cbuf = new char[cBufSize];
			int i;
			while ((i = br.read(cbuf, 0, cBufSize)) > 0) {
				sb.append(cbuf, 0, i);
			}
		} finally {
			br.close(); // fixed: reader/underlying stream was previously leaked
		}
		page = sb.toString();

		// Accept only complete, non-403 HTML documents (either letter case).
		if (page.contains("<html") && page.contains("</html>")
				&& !page.contains("<title>403 Forbidden</title>")
				|| page.contains("<HTML") && page.contains("</HTML>")
				&& !page.contains("<TITLE>403 Forbidden</TITLE>"))
			return;
		throw new IOException(page);
	}

}

/*
 * (Code-hosting page chrome — keyboard-shortcut help, translated:)
 * Copy code: Ctrl + C · Search code: Ctrl + F · Full screen: F11
 * Increase font: Ctrl + = · Decrease font: Ctrl + - · Show shortcuts: ?
 */