
📄 searchresult1.java

📁 Uses multiple threads to download result pages from a search engine and extract the data into a database (a hedged multi-thread usage sketch follows the listing below).
💻 JAVA
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.sql.CallableStatement;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.htmlparser.Parser;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.nodes.AbstractNode;
import org.htmlparser.util.ParserException;

/**
 * Reads stored links from the table search_result_prepare and scrapes each page
 * for links to people's profiles on LinkedIn.com.
 * 
 * @author james
 * 
 */
public class SearchResult1 implements Runnable {

	// upper bound on terms that yield links; MAXCOUNT / interleaving also sizes each stored-procedure batch
	public static int MAXCOUNT = 490;
	// maximum number of page fetches for one instance (see actualPageCount)
	public static int MAXSTPAGECOUNT = 50;
	// apparently the number of interleaved slices the term table is split into, one per thread
	public static int interleaving;
	public static TaskOptimizer threadOpt;
	public static TaskOptimizer pageOpt;

	// term and id used when running a single search directly from main()
	public static int termID1 = 490;
	static String term1 = "http://www.linkedin.com/pub/1/551";

	// URL fragment that appears to mark the search engine's "cached page" links
	// in the result HTML; only hrefs containing it are collected (see parseLinks)
	final private static String pattern = "/search/cache?ipc=";
	// final private static String patternNextPages =
	// "/search?q=http://www.linkedin.com/pub/";

	static int totalLinkCount = 0;
	static int totalLinkInsertCount = 0;
	static int recentLinkInsertCount = 0;

	static int totalPageCount = 0;
	static int recentGoodPageCount = 0;

	static Date taskStart = new Date();
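	// search-engine helper; se.quesryString() appears to return the base query URL prefix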
	static Yahoo se = new Yahoo();

	private Connection cnn;

	long timeEplased;
	private String page = "";
	Set<String> links;
	Set<String> links2;
	List<String> nextPagesLinks;

	// tasks are assigned by this number; when used as a single thread, set it to 1
	private int threadID;

	SearchResultData searchResultData;
	private boolean getLinks;
	private int actualPageCount = 0;

	/**
	 * @param args
	 * @throws SQLException
	 */
	public static void main(String[] args) throws SQLException {
		// new SearchResult().run();
		try {
			new SearchResult1().parseATerm(termID1, term1);
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	public SearchResult1() throws SQLException {
		interleaving = 1;
		this.threadID = 0;
		constructor();
	}

	public SearchResult1(int threadID) throws SQLException {
		this.threadID = threadID;
		constructor();
	}

	private void constructor() throws SQLException {
		cnn = DataAccess.getNewConnection();
		links = new HashSet<String>();
		links2 = new HashSet<String>();
		nextPagesLinks = new ArrayList<String>();
		searchResultData = new SearchResultData(cnn);
	}

	public boolean parseAllPages() throws IOException {
		CallableStatement ps = null;
		ResultSet rs;
		try {
			ps = cnn.prepareCall("{call getTermsInterleaving(?,?,?)}");
			ps.setInt(1, (int) (MAXCOUNT / interleaving));
			ps.setInt(2, interleaving);
			ps.setInt(3, threadID);

			rs = ps.executeQuery();
			// A plain CallableStatement yields a forward-only ResultSet, so test
			// for emptiness with the first next() instead of getRow()/beforeFirst().
			if (!rs.next()) {
				rs.close();
				return false;
			}
			do {
				parseATerm(rs.getInt("id"), rs.getString("term"));
				cnn.commit();
				// threadOpt.reportTask(true);
			} while (rs.next() && (recentGoodPageCount < MAXCOUNT)
					&& actualPageCount < MAXSTPAGECOUNT);
			rs.close();
			ps.close();
		} catch (SQLException e) {
			try {
				cnn.rollback();
			} catch (SQLException e1) {
				e1.printStackTrace();
			}
			e.printStackTrace();
		} catch (IOException e) {
			// System.out.println(e.getMessage());
			throw e;
			// if (e.getMessage().contains("999")){
			// threadOpt.reportTask(false);
			// pageOpt.reportTask(false);
			// }
		}
		return true;

	}

	private int parseAPage(String pageLink) throws IOException {
		// page = getARawPage(pageLink);
		getAPage(pageLink, 20000);
		// Util.saveAFile("c:\\temp3" + id + ".html", page);
		if (page == "" || page == null)
			return 0;
		else
			return parseLinks();
	}

	private void parseATerm(int id, String term) throws SQLException,
			IOException {
		int insertCount = 0;
		links.clear();
		links2.clear();
		getAPage(se.quesryString() + term, 20000);
		// System.out.println(page);
		if (page == "" || page == null)
			return;
		parseLinks();
		if (!getLinks)
			getLinks = links.size() > 0;
		nextPagesLinks.clear();
		parseNextPages(term, page);

		for (String s : nextPagesLinks) {
			int linksCount = parseAPage(s);
			// stop following result pages once a page yields fewer than 100 links
			if (linksCount < 100)
				break;
		}

		decodeUrl();
		insertCount = searchResultData.insert(links2, 1);
		setStatus(id, 1);
		showTask(insertCount, term);
	}

	private void decodeUrl() {
		// HTML-unescape and URL-decode each collected link, storing the result in links2
		for (String s : links) {
			try {
				s = s.replace("&amp;", "&");
				s = java.net.URLDecoder.decode(s, "utf-8");
				links2.add(s);
			} catch (UnsupportedEncodingException e) {
				e.printStackTrace();
			}
		}
	}

	private void parseNextPages(String term, String page) {
		String url;
		String totalPageString;
		int totalPage = 0;
		int count = 1;
		// Pages (813)
		Pattern p = Pattern.compile("Pages \\([\\d\\,]{1,5}\\)");
		Matcher m = p.matcher(page);
		if (!m.find())
			return;

		totalPageString = m.group();
		totalPage = Util.getNumberFromString(totalPageString.replace(",", ""));
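		// Build follow-up result-page URLs. The &b= parameter appears to be the
		// search engine's offset of the first result on a page, advanced 100
		// results at a time (inferred from the URL pattern, not verified).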
		while (count < totalPage) {
			count += 100;
			url = se.quesryString() + term + "&b=" + count
					+ "&bwm=p&bwmo=&bwmf=";
			nextPagesLinks.add(url);
		}
	}

	private void showTask(int insertCount, String term) {

		totalLinkCount += links.size();
		totalLinkInsertCount += insertCount;
		recentLinkInsertCount += insertCount;
		if (links.size() > 0) {
			recentGoodPageCount += 1;
			totalPageCount += 1;
		}

		System.out.println("Links found by this term:" + term + ":"
				+ links.size() + "/Links inserted:" + insertCount);
	}

	private void setStatus(int id, int i) throws SQLException {
		PreparedStatement ps;
		ps = cnn.prepareStatement("update search_terms set tag=? where id=?");
		ps.setInt(1, i);
		ps.setInt(2, id);
		ps.execute();
		ps.close();
	}

	public int parseLinks() {

		int count = 0;
		Parser p = new Parser();
		try {
			p.setInputHTML(page);
		} catch (ParserException e) {
			e.printStackTrace();
		}
		Lexer l = p.getLexer();
		AbstractNode n;
		String link;
		try {
			// Walk the node stream and collect every href that matches the
			// cached-result pattern.
			while ((n = (AbstractNode) l.nextNode()) != null) {
				if (n instanceof org.htmlparser.nodes.TagNode) {
					link = ((org.htmlparser.nodes.TagNode) n).getAttribute("href");
					if (link != null && link.contains(pattern)) {
						links.add(link);
						count++;
					}
				}
			}
		} catch (ParserException e) {
			e.printStackTrace();
		}

		return count;
	}

	public void run() {

		getLinks = false;

		int count = 0;
		try {
			cnn.setAutoCommit(false);
		} catch (SQLException e1) {
			System.out.println(e1.getMessage());
		}
		boolean taskResult = true;
		try {
			while (taskResult && (recentGoodPageCount < MAXCOUNT)
					&& actualPageCount < MAXSTPAGECOUNT) {
				taskResult = parseAllPages();
				System.out.println("Thread " + Thread.currentThread().getName()
						+ " Looping count:" + count++);
			}
			}
		} catch (IOException e) {
			System.out.println(e.getMessage());
		}

		try {
			cnn.close();
		} catch (SQLException e) {
			e.printStackTrace();
		}

		System.out.println("--------------------Thread "
				+ Thread.currentThread().getName()
				+ " Finished-----------------");

	}

	/**
	 * If the server returns a 999 error, rethrow it; for any other error, retry up to three times.
	 * 
	 * @param pageLink
	 * @param timeout
	 * @throws IOException
	 */
	private void getAPage(String pageLink, int timeout) throws IOException {

		boolean result = false;
		// NOTE: the caller-supplied timeout is overridden; every request uses 30 seconds.
		timeout = 30000;
		int tryCount = 0;
		do {
			try {
				Moderator.ensureInterval();
				getAPageSub(pageLink, timeout);
				result = true;
			} catch (IOException e) {
				String msg = e.getMessage();
				// A "999" response is not retried; anything else is retried up to three times.
				if ((msg != null && msg.contains("999")) || ++tryCount > 3)
					throw e;
				else {
					System.out.println(msg);
					System.out.println("Retrying");
				}
			}
		} while (!result && tryCount <= 3);

		actualPageCount++;
	}

	private void getAPageSub(String pageLink, int timeout) throws IOException {

		URL url = new URL(pageLink);
		HttpURLConnection urlCon;
		BufferedReader br = null;
		page = null;
		StringBuilder sb = new StringBuilder();
		int i;
		final int cBufSize = 1000;
		char[] cbuf = new char[cBufSize];

		urlCon = (HttpURLConnection) url.openConnection();
		urlCon.setRequestProperty("User-agent", "IE/6.0");
		urlCon.setReadTimeout(timeout);
		urlCon.setConnectTimeout(timeout);

		try {
			br = new BufferedReader(new InputStreamReader(urlCon.getInputStream()));
			// read the response body in 1000-character chunks
			while ((i = br.read(cbuf, 0, cBufSize)) > 0) {
				sb.append(cbuf, 0, i);
			}
		} finally {
			if (br != null)
				br.close();
		}
		page = sb.toString();

		if (page.contains("<html") && page.contains("</html>")
				&& !page.contains("<title>403 Forbidden</title>")
				|| page.contains("<HTML") && page.contains("</HTML>")
				&& !page.contains("<TITLE>403 Forbidden</TITLE>"))
			return;
		else {
			throw new IOException(page);
		}
	}

}
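
Since SearchResult1 implements Runnable and partitions its work by threadID, it can be driven by a small launcher. The sketch below is a minimal, hypothetical example, not part of the original source: it assumes that the static interleaving field divides the term table into one slice per worker (as the getTermsInterleaving call suggests), that each worker's constructor opens its own database connection via constructor(), and that a thread count of 4 is merely illustrative.

import java.util.ArrayList;
import java.util.List;

// Hypothetical launcher (not in the original project).
public class SearchResultLauncher {

	public static void main(String[] args) throws Exception {
		int threadCount = 4; // illustrative value only
		// assumption: interleaving tells each worker how many slices the term
		// table is split into, so the worker with a given threadID only
		// processes its own slice
		SearchResult1.interleaving = threadCount;

		List<Thread> workers = new ArrayList<Thread>();
		for (int id = 0; id < threadCount; id++) {
			// each SearchResult1 opens its own connection in constructor()
			Thread t = new Thread(new SearchResult1(id), "scraper-" + id);
			workers.add(t);
			t.start();
		}
		for (Thread t : workers) {
			t.join(); // wait until every worker has drained its slice
		}
	}
}

Note that static counters such as recentGoodPageCount are shared, and unsynchronized, across workers, so MAXCOUNT effectively acts as a global limit rather than a per-thread one.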
