⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 util.java

📁 利用多线程从搜索引擎下载网页并提取数据到数据库。
💻 JAVA
字号:
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Locale;
import java.util.Random;

import org.htmlparser.Parser;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.nodes.AbstractNode;
import org.htmlparser.util.ParserException;
import java.util.regex.*;

/**
 * Miscellaneous static helpers: reading and writing text files, fetching web
 * pages over HTTP, cleaning up extracted strings, and parsing loosely
 * formatted dates and numbers.
 * <p>
 * Every public method is {@code static synchronized}, i.e. it locks on
 * {@code Util.class}; calls to these methods are therefore serialized with
 * one another.
 * 
 * @author james
 * 
 */

public class Util {

	/** Shared generator so that successive {@link #getARand()} calls differ. */
	private static final Random RANDOM = new Random();

	/** Size, in chars, of the buffer used when draining a reader. */
	private static final int BUFFER_SIZE = 2000;

	/** Runs of non-word characters at the very start or very end of a string. */
	private static final Pattern EDGE_NON_WORD = Pattern
			.compile("(\\A[^\\w]+)|([^\\w]+\\Z)");

	/** A run of decimal digits. */
	private static final Pattern DIGIT_RUN = Pattern.compile("\\d+");

	/** A month-like word followed by a four-digit year, e.g. "March 2005". */
	private static final Pattern MONTH_YEAR = Pattern
			.compile("[a-zA-Z]{3,15}\\s+\\d{4}");

	/** {@code yyyy}, {@code yyyy-MM} or {@code yyyy-MM-dd}. */
	private static final Pattern NUMERIC_DATE = Pattern
			.compile("\\d{4}(-\\d{2})?(-\\d{2})?");

	/** Any run of whitespace. */
	private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");

	/**
	 * Small manual check: prints the URL-decoded form of a sample cached-page
	 * link.
	 * 
	 * @param args
	 *            ignored
	 */
	public static void main(String[] args) {
		String href = "http://66.218.69.11/search/cache?ipc=1&.intl=&u=www.linkedin.com/pub/0/4/049&d=DjurWHDuP_RL&p=http%3A%2F%2Fwww.linkedin.com%2Fpub%2F0%2F4";
		try {
			System.out.println(java.net.URLDecoder.decode(href, "utf-8"));
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Returns a pseudo-random float.
	 * <p>
	 * Uses one shared generator; creating a new {@code Random} with a fixed
	 * seed on every call would return the same value each time.
	 * 
	 * @return the next value from {@link Random#nextFloat()}, in the range
	 *         {@code [0.0, 1.0)}
	 */
	public static float getARand() {
		return RANDOM.nextFloat();
	}

	/**
	 * Writes {@code FileContent} to {@code fileName}, replacing any existing
	 * content. I/O failures are reported on standard error and otherwise
	 * ignored (best effort).
	 * 
	 * @param fileName
	 *            path of the file to write
	 * @param FileContent
	 *            text to store in the file
	 */
	public synchronized static void saveAFile(String fileName,
			String FileContent) {
		FileWriter writer = null;
		try {
			writer = new FileWriter(fileName);
			writer.write(FileContent);
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Always release the file handle, even when write() failed.
			if (writer != null) {
				try {
					writer.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Reads a whole local text file into a string, using the platform default
	 * charset of {@link FileReader}.
	 * 
	 * @param pageLink
	 *            path of the local file to read
	 * @return the complete file content
	 * @throws IOException
	 *             if the file cannot be opened (a
	 *             {@link FileNotFoundException}) or read
	 */
	public synchronized static String getAPageLocal(String pageLink)
			throws IOException {
		// A missing file is reported to the caller as FileNotFoundException
		// instead of being swallowed and resurfacing as a NullPointerException.
		return readFully(new BufferedReader(new FileReader(pageLink)));
	}

	/**
	 * Downloads a page over HTTP and returns it only if it looks like a
	 * complete HTML document.
	 * <p>
	 * The page is accepted when it contains an opening and a closing
	 * {@code html} tag (both lower case or both upper case) and does not carry
	 * a "403 Forbidden" title in the same case. The response is decoded with
	 * the platform default charset.
	 * 
	 * @param pageLink
	 *            URL of the page to fetch
	 * @param timeout
	 *            connect and read timeout, passed to
	 *            {@link java.net.URLConnection#setConnectTimeout(int)} and
	 *            {@link java.net.URLConnection#setReadTimeout(int)}
	 *            (milliseconds)
	 * @return the page text
	 * @throws IOException
	 *             if the connection or read fails, or if the downloaded text
	 *             is rejected by the check above; in the latter case the
	 *             exception message is the downloaded text itself
	 */
	public synchronized static String getAPage(String pageLink, int timeout)
			throws IOException {
		URL url = new URL(pageLink);

		HttpURLConnection urlCon = (HttpURLConnection) url.openConnection();
		urlCon.setRequestProperty("User-agent", "IE/6.0");
		urlCon.setReadTimeout(timeout);
		urlCon.setConnectTimeout(timeout);

		String page = readFully(new BufferedReader(new InputStreamReader(
				urlCon.getInputStream())));

		boolean lowerCaseHtml = page.contains("<html")
				&& page.contains("</html>")
				&& !page.contains("<title>403 Forbidden</title>");
		boolean upperCaseHtml = page.contains("<HTML")
				&& page.contains("</HTML>")
				&& !page.contains("<TITLE>403 Forbidden</TITLE>");

		if (lowerCaseHtml || upperCaseHtml) {
			return page;
		}
		throw new IOException(page);
	}

	/**
	 * Downloads whatever the given URL returns, without any validation, and
	 * decodes it with the platform default charset.
	 * 
	 * @param pageLink
	 *            URL to fetch
	 * @return the complete response text
	 * @throws IOException
	 *             if the URL is malformed or the stream cannot be opened or
	 *             read
	 */
	public synchronized static String getARawPage(String pageLink)
			throws IOException {
		java.net.URL url = new java.net.URL(pageLink);
		return readFully(new BufferedReader(new InputStreamReader(
				url.openStream())));
	}

	/**
	 * Reads a text file and returns its lines, one list element per line.
	 * I/O failures are reported on standard error; the lines read so far
	 * (possibly none) are still returned.
	 * 
	 * @param fileName
	 *            path of the file holding one term per line
	 * @return the lines of the file, in file order; never {@code null}
	 */
	public synchronized static ArrayList<String> getSearchTerms(String fileName) {
		ArrayList<String> terms = new ArrayList<String>();
		BufferedReader reader = null;

		try {
			reader = new BufferedReader(new FileReader(fileName));
			String line;
			// readLine() returning null is the reliable end-of-file signal;
			// ready() only says whether a read would block.
			while ((line = reader.readLine()) != null) {
				terms.add(line);
			}
		} catch (IOException ioe) {
			// Covers FileNotFoundException as well.
			ioe.printStackTrace();
		} finally {
			if (reader != null) {
				try {
					reader.close();
				} catch (IOException ioe) {
					ioe.printStackTrace();
				}
			}
		}
		return terms;
	}

	/**
	 * Decodes the {@code &quot;} and {@code &amp;} entities and strips
	 * non-word characters from both ends of the string.
	 * 
	 * @param rawString
	 *            text to clean; may be {@code null}
	 * @return the cleaned text, or {@code ""} for {@code null} or empty input
	 */
	public synchronized static String cleanString(String rawString) {
		if (rawString == null || rawString.length() == 0) {
			return "";
		}

		// "&amp;" is decoded last so that an escaped ampersand such as
		// "&amp;quot;" yields the literal text "&quot;" rather than being
		// decoded a second time into a quote character.
		String cleaned = rawString.replace("&quot;", "\"");
		cleaned = cleaned.replace("&amp;", "&");
		return EDGE_NON_WORD.matcher(cleaned).replaceAll("");
	}

	/**
	 * Same contract as {@link #cleanString(String)}; kept for callers that
	 * use this name.
	 * 
	 * @param rawString
	 *            text to clean; may be {@code null}
	 * @return the cleaned text, or {@code ""} for {@code null} or empty input
	 */
	public synchronized static String cleanString2(String rawString) {
		return cleanString(rawString);
	}

	/**
	 * Extracts the first run of decimal digits from a string.
	 * 
	 * @param rawString
	 *            text to search; may be {@code null}
	 * @return the first digit run as an {@code int}, or {@code 0} when the
	 *         input is {@code null}, contains no digits, or the digit run
	 *         does not fit in an {@code int}
	 */
	public synchronized static int getNumberFromString(String rawString) {
		if (rawString == null) {
			return 0;
		}

		Matcher digits = DIGIT_RUN.matcher(rawString);
		if (!digits.find()) {
			return 0;
		}

		try {
			return Integer.parseInt(digits.group());
		} catch (NumberFormatException e) {
			// Digit run too long to be represented as an int.
			return 0;
		}
	}

	/**
	 * Parses a date out of free text. The text is first cleaned with
	 * {@link #cleanString(String)}; then a "Month yyyy" form is tried, and if
	 * that fails a numeric {@code yyyy[-MM[-dd]]} form.
	 * 
	 * @param theDateString
	 *            text that may contain a date; may be {@code null}
	 * @return the parsed date (missing month/day default to the first), or
	 *         {@code null} if neither form is found
	 */
	public synchronized static Date parseDate(String theDateString) {
		String cleaned = Util.cleanString(theDateString);
		Date theDate = parseDate1(cleaned);
		if (theDate != null) {
			return theDate;
		}
		return parseDate2(cleaned);
	}

	/**
	 * Finds the first "word + four digits" occurrence and parses it as a US
	 * English month name and year.
	 * 
	 * @return the parsed date, or {@code null} if nothing matches or the word
	 *         is not a month name
	 */
	private static Date parseDate1(String theDateString) {
		Matcher m = MONTH_YEAR.matcher(theDateString);
		if (!m.find()) {
			return null;
		}

		// The regex accepts any whitespace between month and year, but the
		// "MMMM yyyy" format expects a literal space there, so collapse the
		// separator to exactly one space before parsing.
		String monthAndYear = WHITESPACE_RUN.matcher(m.group())
				.replaceAll(" ");

		try {
			return new SimpleDateFormat("MMMM yyyy", Locale.US)
					.parse(monthAndYear);
		} catch (ParseException e) {
			e.printStackTrace();
			return null;
		}
	}

	/**
	 * Finds the first {@code yyyy}, {@code yyyy-MM} or {@code yyyy-MM-dd}
	 * occurrence and parses it, padding a missing month or day with
	 * {@code 01}.
	 * 
	 * @return the parsed date, or {@code null} if nothing matches
	 */
	private static Date parseDate2(String theDateString) {
		Matcher m = NUMERIC_DATE.matcher(theDateString);
		if (!m.find()) {
			return null;
		}

		String isoDate = m.group();
		if (isoDate.length() == 4) {
			isoDate = isoDate + "-01-01";
		} else if (isoDate.length() == 7) {
			isoDate = isoDate + "-01";
		}

		try {
			return new SimpleDateFormat("yyyy-MM-dd", Locale.US)
					.parse(isoDate);
		} catch (ParseException e) {
			e.printStackTrace();
			return null;
		}
	}

	/**
	 * Drains the reader into a string and closes it, whether or not reading
	 * succeeds.
	 */
	private static String readFully(BufferedReader reader) throws IOException {
		try {
			StringBuilder text = new StringBuilder();
			char[] buffer = new char[BUFFER_SIZE];
			int count;
			while ((count = reader.read(buffer, 0, buffer.length)) != -1) {
				text.append(buffer, 0, count);
			}
			return text.toString();
		} finally {
			reader.close();
		}
	}

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -