⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 urlgetter.java

📁 是个java写的spider,非常不错!能承受很大的压力,每天采集的数量在10000万
💻 JAVA
字号:
/*
 * This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
 *
 * Copyright (c) 2001 Brian Pitcher
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

// $Header: /CVSRepository/spider/cn/yicha/subject/spider/fecther/URLGetter.java,v 1.3 2006/02/16 04:35:27 zhangdi Exp $
package cn.yicha.subject.spider.fecther;

import java.net.*;
import java.io.*;

import org.apache.log4j.Category;
import org.apache.commons.httpclient.*;
import org.apache.commons.httpclient.methods.*;

import cn.yicha.common.util.Log4j;
import cn.yicha.common.util.UTF8Util;
import cn.yicha.subject.spider.SpiderConfig;
import cn.yicha.subject.spider.URLObject;
import cn.yicha.subject.spider.URLToDownload;

/**
 * Downloads a single URL over HTTP while sending the request headers of a
 * mobile handset behind a WAP gateway, and wraps the outcome in a
 * {@link URLObject}.
 * <p>
 * The final URL and the content type of the most recent download are kept in
 * unsynchronized instance fields ({@link #getEndUrl()} and
 * {@link #getContentType()}); concurrent downloads on one instance would
 * overwrite each other's values.
 */
public class URLGetter {
	/**
	 * When true, pages are fetched through URLReader instead of Commons
	 * HttpClient. Nothing in this class ever changes the value.
	 */
	private static boolean useIr = false;

	private String _mobile_no;

	private String _user_agent;

	private String _referer;

	private String _proxy_host;

	private String _proxy_port;

	/** URL the last successful download ended on. */
	private String _end_url;

	/** Content-Type of the last successful download, "" when unknown. */
	private String _content_type = "";

	private final SpiderConfig config;

	/** Connection timeout handed to HttpClient. */
	private static final int _CONNECT_TIMEOUT = 30000;

	/** Socket (download) timeout handed to HttpClient. */
	private static final int _DOWN_TIMEOUT = 30000;

	/** Marker that precedes the content type inside an exception message. */
	public static final String _INVALID_TYPE = "invalid content type --> ";

	/** Name of the response header that carries the content type. */
	public static final String _CONTENT_TYPE_PREFIX = "Content-Type";

	/** Mobile number sent with every request issued by this class. */
	public final static String _DEFAULT_MOBILE_NO = "13439230694";

	/** Content types treated as ring-tone downloads. */
	private static final String[] _RING_TYPES = { "audio/midi", "audio/amr",
			"audio/mpeg", "application/x-smaf", "audio/rmf", "audio/x-ms-wma",
			"audio/wav", "audio/imelody", "audio/aac", "application/x-mfm" };

	/** Content types treated as game downloads. */
	private static final String[] _GAME_TYPES = {
			"text/vnd.sun.j2me.app-descriptor",
			"application/vnd.symbian.install" };

	// Logger for this class.
	private final static Category _logClass = Category
			.getInstance(URLGetter.class);
	static {
		Log4j.init();
	}

	/**
	 * Creates a getter that reads retry count, user agent and proxy settings
	 * from the given configuration.
	 *
	 * @param config spider configuration used by every download
	 */
	public URLGetter(SpiderConfig config) {
		this.config = config;
	}

	/**
	 * Builds an HttpClient for the given host, with the class timeouts applied
	 * and, when both proxy values are present, a proxy configured.
	 *
	 * @param host      target host
	 * @param port      target port
	 * @param proxyHost proxy host; null or "" means no proxy
	 * @param proxyPort proxy port as text; null or "" means no proxy
	 * @return the configured client
	 * @throws NumberFormatException if a non-empty proxy port is not a number
	 */
	private HttpClient getHttpClientInstance(String host, int port,
			String proxyHost, String proxyPort) {

		System.setProperty("org.apache.commons.logging.Log",
				"org.apache.commons.logging.impl.SimpleLog");
		System.setProperty("org.apache.commons.logging.simplelog.showdatetime",
				"true");
		System
				.setProperty(
						"org.apache.commons.logging.simplelog.log.org.apache.commons.httpclient",
						"info");

		HttpClient client = new HttpClient();
		client.setConnectionTimeout(_CONNECT_TIMEOUT);
		client.setTimeout(_DOWN_TIMEOUT);
		client.getHostConfiguration().setHost(host, port);

		// A missing (null) proxy setting means "no proxy", exactly like an
		// empty one; previously null caused a NullPointerException here.
		boolean hasProxyHost = proxyHost != null && proxyHost.length() > 0;
		boolean hasProxyPort = proxyPort != null && proxyPort.length() > 0;
		if (hasProxyHost && hasProxyPort) {
			client.getHostConfiguration().setProxy(proxyHost,
					Integer.parseInt(proxyPort));
		}
		return client;
	}

	/**
	 * Builds an HttpClient for the host and port of the given URL.
	 *
	 * @param url       URL whose host and port are used
	 * @param proxyHost proxy host; null or "" means no proxy
	 * @param proxyPort proxy port as text; null or "" means no proxy
	 * @return the configured client
	 */
	private HttpClient getHttpClientInstance(URL url, String proxyHost,
			String proxyPort) {
		return getHttpClientInstance(url.getHost(), url.getPort(), proxyHost,
				proxyPort);
	}

	/**
	 * Sets the request headers that make the request look like it comes from
	 * a mobile handset.
	 *
	 * @param hm        method to decorate
	 * @param mobileNo  value of the x-up-calling-line-id header
	 * @param userAgent value of the user-agent header
	 * @param referer   referring page, or null to omit the referer header
	 * @param host      value of the host header
	 */
	private void setMobileRequestHeader(HttpMethod hm, String mobileNo,
			String userAgent, URL referer, String host) {
		hm.setRequestHeader("connection", "Keep-Alive");
		hm
				.setRequestHeader(
						"accept",
						"application/vnd.wap.wmlc, application/vnd.wap.wbxml, application/vnd.wap.wmlscriptc, application/xhtml+xml, application/vnd.wap.xhtml+xml, text/html, application/vnd.wap.mms-message, text/css, */*, text/x-vcard, text/x-vcalendar, image/vnd.wap.wbmp, image/gif, */*");
		hm.setRequestHeader("accept-language", "zh");
		hm.setRequestHeader("accept-charset",
				"US-ASCII, ISO-8859-1, UTF-8, ISO-10646-UCS-2");
		hm.setRequestHeader("host", host);
		hm.setRequestHeader("user-agent", userAgent);
		if (referer != null) {
			hm.setRequestHeader("referer", referer.toExternalForm());
		}
		hm.setRequestHeader("bearer-indication", "0");
		hm.setRequestHeader("accept-application", "1,2");
		hm
				.setRequestHeader(
						"via",
						"WTP/1.1 GDGZ-PS-GW004-WAP02.gd.chinamobile.com (Nokia WAP Gateway 4.0/CD3/4.0.04)");
		hm.setRequestHeader("x-network-info",
				"GPRS,8613824478284,10.101.74.116,cmwap,unsecured");
		hm.setRequestHeader("x-forwarded-for", "10.101.74.116");
		hm.addRequestHeader("x-up-calling-line-id", mobileNo);
		hm.setRequestHeader("x-source-id", "cmwap");
		hm.setRequestHeader("x-nokia-connection_mode", "CMODE");
		hm.setRequestHeader("x-up-bearer-type", "GPRS");
		hm.setRequestHeader("x-nokia-gateway-id", "NWG/4.0/CD3/Build04");
		hm.setRequestHeader("x-wap-profile",
				"http://wap.sonyericsson.com/UAprof/T238R101.xml");
	}

	/**
	 * Debug helper that prints the response body to standard output.
	 * A failure is logged; it no longer terminates the JVM.
	 *
	 * @param hm executed method whose body is printed
	 */
	private static void printResponse(HttpMethod hm) {
		try {
			System.out.println("\n\nresponse:"
					+ UTF8Util.UTF2GB(hm.getResponseBodyAsString()) + "\n\n");
		} catch (Exception e) {
			_logClass.warn("unable to print response body", e);
		}

	}

	/**
	 * Downloads the URL with Commons HttpClient, following redirects, and
	 * records the final URL and content type on this instance.
	 *
	 * @param url       URL to download
	 * @param userAgent user-agent header value
	 * @param mobileNo  mobile number header value
	 * @param referer   referring page, or null
	 * @param proxyHost proxy host; null or "" means no proxy
	 * @param proxyPort proxy port as text; null or "" means no proxy
	 * @return the response body as returned by the HTTP method
	 * @throws URIException if the final URI cannot be read
	 * @throws IOException  if the request fails
	 */
	protected byte[] getURL(URL url, String userAgent, String mobileNo,
			URL referer, String proxyHost, String proxyPort)
			throws URIException, IOException {
		HttpClient client = getHttpClientInstance(url, proxyHost, proxyPort);

		TestGetMethod gm = new TestGetMethod(url.toExternalForm());
		try {
			setMobileRequestHeader(gm, mobileNo, userAgent, referer, url
					.getHost());
			gm.setFollowRedirects(true);

			int result = client.executeMethod(gm);
			_logClass.info("status code --> " + result);

			byte[] content = gm.getResponseBody();

			// Remember where the request ended up and what it returned.
			setEndUrl(gm.getURI().toString());
			Header header = gm.getResponseHeader(_CONTENT_TYPE_PREFIX);
			// Reset to "" when the header is absent so the content type of an
			// earlier download is never reported for this one.
			setContentType(header != null ? header.getValue() : "");

			return content;
		} finally {
			// Always hand the connection back, including on failure.
			gm.releaseConnection();
		}
	}

	/**
	 * Downloads the URL through URLReader and records the requested URL and
	 * the content type reported by URLReader on this instance.
	 *
	 * @param url       URL to download
	 * @param userAgent unused by this variant
	 * @param mobileNo  unused by this variant
	 * @param referer   unused by this variant
	 * @param proxyHost proxy host passed to URLReader
	 * @param proxyPort proxy port passed to URLReader
	 * @return the page bytes, or null when URLReader returned no content
	 * @throws URIException declared for symmetry with the HttpClient variant
	 * @throws IOException  declared for symmetry with the HttpClient variant
	 */
	protected byte[] getURLByIr(URL url, String userAgent, String mobileNo,
			URL referer, String proxyHost, String proxyPort)
			throws URIException, IOException {

		String cont = URLReader.readUrlContentByProxy(url.toExternalForm(),
				proxyHost, proxyPort, false);
		if (cont == null) {
			// The caller treats null as "nothing downloaded, try again".
			return null;
		}
		// NOTE(review): getBytes() uses the platform default charset; which
		// charset URLReader decoded with is not visible here - confirm.
		byte[] content = cont.getBytes();

		setEndUrl(url.toExternalForm());
		setContentType(URLReader.get_content_type());

		return content;
	}

	/**
	 * Downloads a page, retrying up to the configured maximum number of
	 * attempts.
	 *
	 * @param url download request; its end URL is updated on success
	 * @return a URLObject holding the content on success, otherwise a
	 *         URLObject carrying the type of the recorded error
	 */
	public URLObject getURL(URLToDownload url) {
		URL requestedURL = url.getURL();
		URL referer = url.getReferer();

		int errorType = URLObject._OTHER_EXCEPTION;
		for (int attempt = 0; attempt < config.getMaxTryCount(); attempt++) {
			try {
				byte[] content;
				if (useIr) {
					content = getURLByIr(requestedURL, config.getUserAgent(),
							_DEFAULT_MOBILE_NO, referer, config
									.getProxyHost(), config.getProxyPort());
				} else {
					content = getURL(requestedURL, config.getUserAgent(),
							_DEFAULT_MOBILE_NO, referer, config
									.getProxyHost(), config.getProxyPort());
				}

				if (content == null) {
					continue;
				}

				// Some sites redirect, so record the URL of the page that
				// was finally downloaded.
				url.setEndURL(new URL(getEndUrl()));

				return new URLObject(requestedURL, getContentType(), content,
						config, url.getServiceID(), url.getIsBeforeSubs());
			} catch (URIException hre) {
				// Ring-tone / game content-type handling (isRingType,
				// isGameType, saveMediaUrl) used to run here and is
				// currently disabled; such URLs are only logged and are
				// not retried.
				_logClass.info("special url: " + url.toString());
				break;
			} catch (FileNotFoundException fnfe) {
				_logClass.warn("exception url --> "
						+ requestedURL.toExternalForm());
				_logClass.warn("File not found:	" + fnfe.getMessage());
				errorType = URLObject._FILE_NOT_FOUND_EXCEPTION;
			} catch (IOException ioe) {
				_logClass.warn("exception url --> "
						+ requestedURL.toExternalForm());
				_logClass.warn("Caught IO Exception: " + ioe.getMessage(), ioe);

				String message = ioe.getMessage();
				boolean timedOut = message != null
						&& message.toLowerCase().indexOf("timed out") >= 0;
				errorType = timedOut ? URLObject._CONNECT_TIMEOUT_EXCEPTION
						: URLObject._OTHER_EXCEPTION;
			} catch (Exception ex) {
				// errorType deliberately keeps whatever an earlier attempt
				// recorded, as before.
				_logClass.warn("exception url --> "
						+ requestedURL.toExternalForm());
				_logClass.warn("Caught Exception: " + ex.getMessage(), ex);
			}
		}

		return new URLObject(requestedURL, errorType);
	}

	/**
	 * Tells whether an exception message names a ring-tone or game content
	 * type.
	 *
	 * @param exception exception message, may be null
	 * @return true when the message names a ring-tone or game content type
	 */
	private static boolean isInvalidType(String exception) {
		if (exception == null) {
			return false;
		}

		String contentType = getContentType(exception);
		return isGameType(contentType) || isRingType(contentType);
	}

	/**
	 * Extracts the content type that follows the {@link #_INVALID_TYPE}
	 * marker in an exception message.
	 *
	 * @param exception exception message, must not be null
	 * @return the text after the marker, or the whole message when the
	 *         marker is absent
	 */
	private static String getContentType(String exception) {
		int pos = exception.indexOf(_INVALID_TYPE);
		if (pos < 0) {
			return exception;
		}
		// Skip the whole marker, not just its first character.
		return exception.substring(pos + _INVALID_TYPE.length());
	}

	/**
	 * Appends "url TAB contentType" as one line to the given file, creating
	 * parent directories when needed. Failures are logged, not thrown.
	 *
	 * @param url         URL of the ring-tone or game download page
	 * @param contentType content type reported for that URL
	 * @param fileName    path of the file to append to
	 */
	private void saveMediaUrl(String url, String contentType, String fileName) {
		String log = url + "\t" + contentType;

		PrintWriter pw = null;
		try {
			// getParentFile() is null for a bare file name.
			File parent = new File(fileName).getParentFile();
			if (parent != null) {
				parent.mkdirs();
			}

			pw = new PrintWriter(new OutputStreamWriter(
					new FileOutputStream(fileName, true)));
			pw.println(log);
			// PrintWriter swallows write errors; surface them in the log.
			if (pw.checkError()) {
				_logClass.warn("IO error writing to " + fileName);
			}
		} catch (IOException ioe) {
			_logClass.warn("IO Exception writing to " + fileName, ioe);
		} finally {
			if (pw != null) {
				pw.close();
			}
		}
	}

	/**
	 * Tells whether the content type contains one of the given fragments.
	 *
	 * @param contentType content type to inspect, may be null
	 * @param fragments   fragments to look for
	 * @return true when at least one fragment occurs in the content type
	 */
	private static boolean containsAny(String contentType, String[] fragments) {
		if (contentType == null) {
			return false;
		}
		for (int i = 0; i < fragments.length; i++) {
			if (contentType.indexOf(fragments[i]) >= 0) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Tells whether the content type denotes a ring-tone download.
	 *
	 * @param contentType content type to inspect, may be null
	 * @return true when it contains one of the known ring-tone types
	 */
	private static boolean isRingType(String contentType) {
		return containsAny(contentType, _RING_TYPES);
	}

	/**
	 * Tells whether the content type denotes a game download.
	 *
	 * @param contentType content type to inspect, may be null
	 * @return true when it contains one of the known game types
	 */
	private static boolean isGameType(String contentType) {
		return containsAny(contentType, _GAME_TYPES);
	}

	public String getMobileNo() {
		return _mobile_no;
	}

	public void setMobileNo(String _mobile_no) {
		this._mobile_no = _mobile_no;
	}

	public String getUserAgent() {
		return _user_agent;
	}

	public void setUserAgent(String _user_agent) {
		this._user_agent = _user_agent;
	}

	public String getReferer() {
		return _referer;
	}

	public void setReferer(String _referer) {
		this._referer = _referer;
	}

	public String getProxyHost() {
		return _proxy_host;
	}

	public void setProxyHost(String _proxy_host) {
		this._proxy_host = _proxy_host;
	}

	public String getProxyPort() {
		return _proxy_port;
	}

	public void setProxyPort(String _proxy_port) {
		this._proxy_port = _proxy_port;
	}

	/** @return the URL the last successful download ended on */
	public String getEndUrl() {
		return _end_url;
	}

	public void setEndUrl(String _end_url) {
		this._end_url = _end_url;
	}

	/** @return the content type of the last successful download */
	public String getContentType() {
		return _content_type;
	}

	public void setContentType(String _content_type) {
		this._content_type = _content_type;
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -