⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 searchresult.java

📁 利用多线程从搜索引擎下载网页并提取数据到数据库。
💻 JAVA
字号:
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.Set;

import org.htmlparser.Parser;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.nodes.AbstractNode;
import org.htmlparser.util.ParserException;

/**
 * Reads stored search terms from table search_terms, fetches the search
 * engine result pages for each term, and scrapes each page for links to
 * persons' profiles on LinkedIn.com.
 * 
 * @author james
 * 
 */
public class SearchResult implements Runnable {

	/**
	 * Maximum number of terms processed per run; compared against the shared
	 * {@link #totalPageCount}. Also used (times 1.5) to size the id window
	 * that is queried in {@link #parseAllPages()}.
	 */
	public static int MAXCOUNT = 490;

	/**
	 * Optional SQL LIKE pattern for selecting terms, e.g.
	 * "http://www.linkedin.com/pub/_/0". When empty, terms are picked from
	 * the window of untagged ids instead.
	 */
	String term1 = "";

	/** Substring an href must contain to be collected as a result link. */
	final private static String pattern = "search?q=cache:";
	/** Substring an href must contain to be treated as a follow-up result page. */
	final private static String patternNextPages = "/search?q=http://www.linkedin.com/pub/";

	// Progress counters shared by every instance. All updates and reads made
	// by this class go through the SearchResult.class lock.
	static int totalLinkCount = 0;
	static int totalLinkInsertCount = 0;
	static int recentLinkInsertCount = 0;

	static int totalPageCount = 0;

	static Date taskStart = new Date();
	static Google se = new Google();

	private Connection cnn;

	long timeEplased;
	/** HTML of the most recently fetched page; consumed by parseLinks. */
	private String page = "";
	Set<String> links;
	Set<String> nextPagesLinks;

	// Used to assign tasks: an instance handles ids where id % interleaving ==
	// threadID. When used as a single thread, set it to 1.
	private int interleaving;

	private int threadID;

	SearchResultData searchResultData;

	/**
	 * Runs a single worker on the calling thread.
	 * 
	 * @param args
	 *            ignored
	 * @throws SQLException
	 *             if the worker cannot be set up
	 */
	public static void main(String[] args) throws SQLException {
		new SearchResult().run();
	}

	/**
	 * Creates a worker that handles every id (interleaving 1, thread id 0).
	 * 
	 * @throws SQLException
	 *             if no database connection can be obtained
	 */
	public SearchResult() throws SQLException {
		this.interleaving = 1;
		this.threadID = 0;
		constructor();

	}

	/**
	 * Creates a worker that handles the ids for which
	 * {@code id % interleaving == threadID}.
	 * 
	 * @param threadID
	 *            remainder this worker is responsible for
	 * @param interleaving
	 *            modulus used to split ids between workers
	 * @throws SQLException
	 *             if no database connection can be obtained
	 */
	public SearchResult(int threadID, int interleaving) throws SQLException {
		this.interleaving = interleaving;
		this.threadID = threadID;
		constructor();
	}

	/** Shared initialisation for both constructors. */
	private void constructor() throws SQLException {
		cnn = DataAccess.getNewConnection();
		links = new HashSet<String>();
		nextPagesLinks = new HashSet<String>();
		searchResultData = new SearchResultData(cnn);
	}

	/**
	 * Returns the smallest id among the search terms still tagged 0.
	 * 
	 * @return the value of min(id), or Integer.MAX_VALUE if the query yields
	 *         no row
	 * @throws SQLException
	 *             on database errors
	 */
	private int getMinID() throws SQLException {
		PreparedStatement ps = null;
		ResultSet rs = null;
		try {
			ps = cnn
					.prepareStatement("SELECT min(id)as minid FROM search_terms s where tag=0");
			rs = ps.executeQuery();
			// NOTE(review): an aggregate over no rows normally yields one row
			// holding SQL NULL, which getInt reports as 0, so the fallback
			// below is probably never reached - confirm against the database.
			if (rs.next())
				return rs.getInt("minid");
			return Integer.MAX_VALUE;
		} finally {
			closeQuietly(rs);
			closeQuietly(ps);
		}
	}

	/**
	 * Selects the terms assigned to this worker and processes them one by
	 * one, committing after each term, until the result set is exhausted or
	 * the shared term counter reaches {@link #MAXCOUNT}. On failure the open
	 * transaction is rolled back. The connection is always closed before
	 * returning, so an instance can be run only once.
	 */
	public void parseAllPages() {
		PreparedStatement ps = null;
		ResultSet rs = null;
		try {
			cnn.setAutoCommit(false);

			if (term1 == null || term1.length() == 0) {
				int low = getMinID();
				int high = (int) (low + Math.round(MAXCOUNT * 1.5));
				ps = cnn.prepareStatement("SELECT s.id,s.term, s.tag "
						+ "FROM search_terms s "
						+ "where tag=0 and id%?=? and id between ? and ?");
				ps.setInt(1, interleaving);
				ps.setInt(2, threadID);
				ps.setInt(3, low);
				ps.setInt(4, high);
			} else {
				// Bind the pattern instead of concatenating it into the SQL
				// text, so quotes in term1 cannot break or alter the query.
				ps = cnn.prepareStatement("SELECT s.id,s.term, s.tag "
						+ "FROM search_terms s " + "where term like ?");
				ps.setString(1, term1);
			}

			rs = ps.executeQuery();
			while (rs.next() && pageBudgetLeft()) {
				parseATerm(rs.getInt("id"), rs.getString("term"));
				cnn.commit();
			}
		} catch (SQLException e) {
			rollbackQuietly();
			e.printStackTrace();
		} catch (IOException e) {
			// Do not leave a half-finished term pending when the fetch fails.
			rollbackQuietly();
			e.printStackTrace();
		} finally {
			closeQuietly(rs);
			closeQuietly(ps);
			try {
				cnn.close();
			} catch (SQLException e) {
				e.printStackTrace();
			}
		}
	}

	/** Tells whether the shared term counter is still below MAXCOUNT. */
	private static synchronized boolean pageBudgetLeft() {
		return totalPageCount < MAXCOUNT;
	}

	/** Rolls back the current transaction, reporting but not propagating errors. */
	private void rollbackQuietly() {
		try {
			cnn.rollback();
		} catch (SQLException e) {
			e.printStackTrace();
		}
	}

	/** Closes a result set if present, reporting but not propagating errors. */
	private static void closeQuietly(ResultSet rs) {
		if (rs == null)
			return;
		try {
			rs.close();
		} catch (SQLException e) {
			e.printStackTrace();
		}
	}

	/** Closes a statement if present, reporting but not propagating errors. */
	private static void closeQuietly(PreparedStatement ps) {
		if (ps == null)
			return;
		try {
			ps.close();
		} catch (SQLException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Fetches one page into {@link #page} and collects its result links.
	 * 
	 * @param pageLink
	 *            address of the page to fetch
	 * @throws IOException
	 *             if the fetch fails
	 */
	private void parseAPage(String pageLink) throws IOException {
		page = Util.getAPage(pageLink,10000);
		// Compare by content, not by reference, and test null first.
		if (page == null || page.length() == 0)
			return;
		parseLinks(pattern, links);
	}

	/**
	 * Processes one search term: fetches its first result page, follows the
	 * further result pages linked from it, stores the collected links, tags
	 * the term as done (tag=1) and prints progress.
	 * 
	 * @param id
	 *            id of the term in search_terms
	 * @param term
	 *            text appended to the search engine query string
	 * @throws SQLException
	 *             on database errors
	 * @throws IOException
	 *             if a page cannot be fetched
	 */
	private void parseATerm(int id, String term) throws SQLException,
			IOException {
		links.clear();
		nextPagesLinks.clear();
		parseAPage(se.quesryString() + term + se.quesryStringAppend());
		// The first page is still in 'page'; harvest the follow-up page links
		// from it before the loop below overwrites it.
		parseLinks(patternNextPages, nextPagesLinks);
		for (String s : nextPagesLinks)
			parseAPage(se.host() + s);
		// NOTE(review): the meaning of the second argument (2) is defined by
		// SearchResultData.insert, which is not visible here.
		int insertCount = searchResultData.insert(links,2);
		setStatus(id, 1);
		showTask(insertCount, term);
	}

	/**
	 * Updates the shared progress counters and prints a progress report.
	 * 
	 * @param insertCount
	 *            number of links inserted for this term
	 * @param term
	 *            the term just processed
	 */
	private void showTask(int insertCount, String term) {
		// The counters are static and shared by all workers; the instance
		// lock taken by run() does not protect them, so use the class lock.
		synchronized (SearchResult.class) {
			totalPageCount += 1;
			totalLinkCount += links.size();
			totalLinkInsertCount += insertCount;
			recentLinkInsertCount += insertCount;
			System.out.println("Links found by this term:" + term + ":"
					+ links.size() + "/Links inserted:" + insertCount);
			timeEplased = new Date().getTime() - taskStart.getTime();
			System.out.println("Total terms count:" + totalPageCount
					+ "/Total links found:" + totalLinkCount
					+ "/ Total link insert:" + totalLinkInsertCount);
			System.out.println("Recent links inserted:" + recentLinkInsertCount
					+ ". /Time elapsed:" + timeEplased / 60000 + " minutes");
			// The "recent" counter is restarted after every tenth term.
			if (totalPageCount % 10 == 0)
				recentLinkInsertCount = 0;
		}
	}

	/**
	 * Sets the tag column of one search term.
	 * 
	 * @param id
	 *            id of the term in search_terms
	 * @param i
	 *            new tag value
	 * @throws SQLException
	 *             on database errors
	 */
	private void setStatus(int id, int i) throws SQLException {
		PreparedStatement ps = null;
		try {
			ps = cnn.prepareStatement("update search_terms set tag=? where id=?");
			ps.setInt(1, i);
			ps.setInt(2, id);
			ps.execute();
		} finally {
			closeQuietly(ps);
		}
	}

	/**
	 * Scans the most recently fetched page and adds to {@code set} every href
	 * attribute value that contains {@code pattern}. Does nothing when no
	 * page content is available.
	 * 
	 * @param pattern
	 *            substring an href must contain to be collected
	 * @param set
	 *            receives the matching href values
	 */
	public void parseLinks(String pattern, Set<String> set) {
		// A failed or empty fetch leaves nothing to scan; do not hand a null
		// document to the parser.
		if (page == null || page.length() == 0)
			return;

		org.htmlparser.Parser p = new Parser();
		try {
			p.setInputHTML(page);
		} catch (ParserException e) {
			e.printStackTrace();
			return;
		}
		Lexer l = p.getLexer();
		org.htmlparser.nodes.AbstractNode n;
		try {
			while ((n = (AbstractNode) l.nextNode()) != null) {
				if (!(n instanceof org.htmlparser.nodes.TagNode))
					continue;
				String link = ((org.htmlparser.nodes.TagNode) n)
						.getAttribute("href");
				if (link != null && link.contains(pattern))
					set.add(link);
			}
		} catch (ParserException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Processes all terms assigned to this worker, then prints a completion
	 * line naming the current thread.
	 */
	public synchronized void run() {
		parseAllPages();
		System.out.println("--------------------Thread "
				+ Thread.currentThread().getName()
				+ " Finished-----------------");

	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -