// urlgetter.java
// (code-viewer residue: font-size control label)
/*
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
*
* Copyright (c) 2001 Brian Pitcher
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
// $Header: /CVSRepository/spider/cn/yicha/subject/spider/fecther/URLGetter.java,v 1.3 2006/02/16 04:35:27 zhangdi Exp $
package cn.yicha.subject.spider.fecther;
import java.net.*;
import java.io.*;
import org.apache.log4j.Category;
import org.apache.commons.httpclient.*;
import org.apache.commons.httpclient.methods.*;
import cn.yicha.common.util.Log4j;
import cn.yicha.common.util.UTF8Util;
import cn.yicha.subject.spider.SpiderConfig;
import cn.yicha.subject.spider.URLObject;
import cn.yicha.subject.spider.URLToDownload;
/**
 * Downloads pages over HTTP while sending the request headers of a mobile
 * (WAP) handset, retrying failed attempts up to
 * {@code SpiderConfig.getMaxTryCount()} times.
 *
 * <p>The final URL and the content type of the most recent download are kept
 * in instance fields (see {@link #getEndUrl()} and {@link #getContentType()})
 * and are read back by {@link #getURL(URLToDownload)}. Concurrent downloads on
 * the same instance would therefore overwrite each other's values.
 */
public class URLGetter {
    /** When true, pages are fetched through {@code URLReader} instead of HttpClient. */
    private static boolean useIr = false;

    // The following five fields are only exposed through their getters and
    // setters; the download methods of this class take the equivalent values
    // from SpiderConfig and _DEFAULT_MOBILE_NO instead.
    private String _mobile_no;
    private String _user_agent;
    private String _referer;
    private String _proxy_host;
    private String _proxy_port;

    /** URL reported for the most recent download (after redirects when HttpClient is used). */
    private String _end_url;
    /** Content-Type value of the most recent download; empty when none was reported. */
    private String _content_type = "";

    private final SpiderConfig config;

    /** Connection timeout handed to HttpClient, in milliseconds. */
    private static final int _CONNECT_TIMEOUT = 30000;
    /** Socket (download) timeout handed to HttpClient, in milliseconds. */
    private static final int _DOWN_TIMEOUT = 30000;

    public static final String _INVALID_TYPE = "invalid content type --> ";
    public static final String _CONTENT_TYPE_PREFIX = "Content-Type";
    public final static String _DEFAULT_MOBILE_NO = "13439230694";

    /** Content-type fragments that identify a ring-tone download. */
    private static final String[] _RING_TYPES = { "audio/midi", "audio/amr",
            "audio/mpeg", "application/x-smaf", "audio/rmf", "audio/x-ms-wma",
            "audio/wav", "audio/imelody", "audio/aac", "application/x-mfm" };

    /** Content-type fragments that identify a game download. */
    private static final String[] _GAME_TYPES = {
            "text/vnd.sun.j2me.app-descriptor",
            "application/vnd.symbian.install" };

    // Logging facade for this class.
    private final static Category _logClass = Category
            .getInstance(URLGetter.class);

    static {
        Log4j.init();
    }

    /**
     * Creates a getter that reads its user agent, proxy settings and retry
     * limit from the given configuration.
     *
     * @param config spider configuration used by {@link #getURL(URLToDownload)}
     */
    public URLGetter(SpiderConfig config) {
        this.config = config;
    }

    /**
     * Builds an HttpClient for the given host, with the class-wide timeouts
     * and an optional proxy.
     *
     * @param host      target host name
     * @param port      target port as reported by the URL
     * @param proxyHost proxy host; null or blank means "no proxy"
     * @param proxyPort proxy port as a decimal string; null or blank means "no proxy"
     * @return a configured client
     * @throws NumberFormatException if a non-blank proxyPort is not a number
     */
    private HttpClient getHttpClientInstance(String host, int port,
            String proxyHost, String proxyPort) {
        // Route commons-logging to SimpleLog and keep HttpClient at "info".
        System.setProperty("org.apache.commons.logging.Log",
                "org.apache.commons.logging.impl.SimpleLog");
        System.setProperty("org.apache.commons.logging.simplelog.showdatetime",
                "true");
        System.setProperty(
                "org.apache.commons.logging.simplelog.log.org.apache.commons.httpclient",
                "info");

        HttpClient client = new HttpClient();
        client.setConnectionTimeout(_CONNECT_TIMEOUT);
        client.setTimeout(_DOWN_TIMEOUT);
        client.getHostConfiguration().setHost(host, port);
        // Null-safe: a missing proxy setting must not abort the download.
        if (!isBlank(proxyHost) && !isBlank(proxyPort)) {
            client.getHostConfiguration().setProxy(proxyHost.trim(),
                    Integer.parseInt(proxyPort.trim()));
        }
        return client;
    }

    /**
     * Builds an HttpClient for the host and port of the given URL.
     *
     * @see #getHttpClientInstance(String, int, String, String)
     */
    private HttpClient getHttpClientInstance(URL url, String proxyHost,
            String proxyPort) {
        return getHttpClientInstance(url.getHost(), url.getPort(), proxyHost,
                proxyPort);
    }

    /** Returns true when the string is null or contains only whitespace. */
    private static boolean isBlank(String value) {
        return value == null || value.trim().length() == 0;
    }

    /**
     * Sets the request headers that make the request look like it comes from
     * a mobile handset behind a WAP gateway.
     *
     * @param hm        method to decorate
     * @param mobileNo  value sent as {@code x-up-calling-line-id}
     * @param userAgent value sent as {@code user-agent}
     * @param referer   referring page, or null to send no {@code referer} header
     * @param host      value sent as {@code host}
     */
    private void setMobileRequestHeader(HttpMethod hm, String mobileNo,
            String userAgent, URL referer, String host) {
        hm.setRequestHeader("connection", "Keep-Alive");
        hm.setRequestHeader(
                "accept",
                "application/vnd.wap.wmlc, application/vnd.wap.wbxml, application/vnd.wap.wmlscriptc, application/xhtml+xml, application/vnd.wap.xhtml+xml, text/html, application/vnd.wap.mms-message, text/css, */*, text/x-vcard, text/x-vcalendar, image/vnd.wap.wbmp, image/gif, */*");
        hm.setRequestHeader("accept-language", "zh");
        hm.setRequestHeader("accept-charset",
                "US-ASCII, ISO-8859-1, UTF-8, ISO-10646-UCS-2");
        hm.setRequestHeader("host", host);
        hm.setRequestHeader("user-agent", userAgent);
        if (referer != null) {
            hm.setRequestHeader("referer", referer.toExternalForm());
        }
        hm.setRequestHeader("bearer-indication", "0");
        hm.setRequestHeader("accept-application", "1,2");
        hm.setRequestHeader(
                "via",
                "WTP/1.1 GDGZ-PS-GW004-WAP02.gd.chinamobile.com (Nokia WAP Gateway 4.0/CD3/4.0.04)");
        hm.setRequestHeader("x-network-info",
                "GPRS,8613824478284,10.101.74.116,cmwap,unsecured");
        hm.setRequestHeader("x-forwarded-for", "10.101.74.116");
        hm.addRequestHeader("x-up-calling-line-id", mobileNo);
        hm.setRequestHeader("x-source-id", "cmwap");
        hm.setRequestHeader("x-nokia-connection_mode", "CMODE");
        hm.setRequestHeader("x-up-bearer-type", "GPRS");
        hm.setRequestHeader("x-nokia-gateway-id", "NWG/4.0/CD3/Build04");
        hm.setRequestHeader("x-wap-profile",
                "http://wap.sonyericsson.com/UAprof/T238R101.xml");
    }

    /**
     * Debug helper: prints the response body to standard output.
     *
     * <p>A failure is only logged; a diagnostic helper must never terminate
     * the JVM.
     */
    private static void printResponse(HttpMethod hm) {
        try {
            System.out.println("\n\nresponse:"
                    + UTF8Util.UTF2GB(hm.getResponseBodyAsString()) + "\n\n");
        } catch (Exception e) {
            _logClass.warn("failed to print response body", e);
        }
    }

    /**
     * Fetches a URL with HttpClient, following redirects, and records the
     * final URL and content type on this instance.
     *
     * @param url       page to download
     * @param userAgent user-agent header value
     * @param mobileNo  calling-line-id header value
     * @param referer   referring page, may be null
     * @param proxyHost proxy host, null or blank for a direct connection
     * @param proxyPort proxy port, null or blank for a direct connection
     * @return the response body as returned by the HTTP method
     * @throws URIException if HttpClient reports a URI problem
     * @throws IOException  on any other transport failure
     */
    protected byte[] getURL(URL url, String userAgent, String mobileNo,
            URL referer, String proxyHost, String proxyPort)
            throws URIException, IOException {
        HttpClient client = getHttpClientInstance(url, proxyHost, proxyPort);
        TestGetMethod gm = new TestGetMethod(url.toExternalForm());
        try {
            setMobileRequestHeader(gm, mobileNo, userAgent, referer,
                    url.getHost());
            gm.setFollowRedirects(true);

            int result = client.executeMethod(gm);
            _logClass.info("status code --> " + result);
            byte[] content = gm.getResponseBody();

            // Record where the request ended up and what it returned.
            setEndUrl(gm.getURI().toString());
            Header header = gm.getResponseHeader(_CONTENT_TYPE_PREFIX);
            // Reset to the empty default when the header is absent so the
            // content type of a previous download is not reported again.
            setContentType(header != null ? header.getValue() : "");
            return content;
        } finally {
            // Always hand the connection back, also when the request failed.
            gm.releaseConnection();
        }
    }

    /**
     * Fetches a URL through {@code URLReader} and records the requested URL
     * and the reader's content type on this instance.
     *
     * <p>The text is converted to bytes with the platform default charset.
     *
     * @return the page bytes, or null when the reader returned no content
     * @throws URIException declared for signature parity with the HttpClient variant
     * @throws IOException  declared for signature parity with the HttpClient variant
     */
    protected byte[] getURLByIr(URL url, String userAgent, String mobileNo,
            URL referer, String proxyHost, String proxyPort)
            throws URIException, IOException {
        String cont = URLReader.readUrlContentByProxy(url.toExternalForm(),
                proxyHost, proxyPort, false);
        if (cont == null) {
            // The caller treats a null body as "try again".
            return null;
        }
        byte[] content = cont.getBytes();
        setEndUrl(url.toExternalForm());
        setContentType(URLReader.get_content_type());
        return content;
    }

    /**
     * Downloads a page, retrying until it succeeds or the configured number
     * of attempts is used up.
     *
     * <p>On success the end URL is stored on {@code url} and a URLObject
     * carrying the content is returned. A URIException stops the retries
     * immediately. When no attempt succeeds, the returned URLObject carries
     * the error type of the last classified failure.
     *
     * @param url download request (URL, referer, service id)
     * @return the downloaded page, or an error-typed URLObject
     */
    public URLObject getURL(URLToDownload url) {
        URL requestedURL = url.getURL();
        URL referer = url.getReferer();
        int tryCount = 0;
        int errorType = URLObject._OTHER_EXCEPTION;
        while (tryCount++ < config.getMaxTryCount()) {
            try {
                byte[] content;
                if (useIr) {
                    content = getURLByIr(requestedURL, config.getUserAgent(),
                            _DEFAULT_MOBILE_NO, referer,
                            config.getProxyHost(), config.getProxyPort());
                } else {
                    content = getURL(requestedURL, config.getUserAgent(),
                            _DEFAULT_MOBILE_NO, referer,
                            config.getProxyHost(), config.getProxyPort());
                }
                if (content == null) {
                    continue;
                }
                // Some sites redirect, so record the URL the download ended on.
                url.setEndURL(new URL(getEndUrl()));
                return new URLObject(requestedURL, getContentType(), content,
                        config, url.getServiceID(), url.getIsBeforeSubs());
            } catch (URIException hre) {
                // Ring-tone and game content types used to be recorded here
                // through saveMediaUrl(); that handling is currently disabled.
                _logClass.info("special url: " + url.toString(), hre);
                break;
            } catch (FileNotFoundException fnfe) {
                _logClass.warn("exception url --> "
                        + requestedURL.toExternalForm());
                _logClass.warn("File not found: " + fnfe.getMessage());
                errorType = URLObject._FILE_NOT_FOUND_EXCEPTION;
                continue;
            } catch (IOException ioe) {
                _logClass.warn("exception url --> "
                        + requestedURL.toExternalForm());
                _logClass.warn("Caught IO Exception: " + ioe.getMessage(), ioe);
                errorType = URLObject._OTHER_EXCEPTION;
                String message = ioe.getMessage();
                // Fixed locale: the default locale must not alter the match.
                if (message != null
                        && message.toLowerCase(java.util.Locale.ENGLISH)
                                .indexOf("timed out") >= 0) {
                    errorType = URLObject._CONNECT_TIMEOUT_EXCEPTION;
                }
                continue;
            } catch (Exception ex) {
                _logClass.warn("exception url --> "
                        + requestedURL.toExternalForm());
                _logClass.warn("Caught Exception: " + ex.getMessage(), ex);
                // Do not report the error type of an earlier attempt.
                errorType = URLObject._OTHER_EXCEPTION;
                continue;
            }
        }
        return new URLObject(requestedURL, errorType);
    }

    /**
     * Tells whether an exception message names a game or ring-tone content
     * type.
     *
     * @param exception exception message, may be null
     * @return true when the message contains a known game or ring-tone type
     */
    private static boolean isInvalidType(String exception) {
        if (exception == null) {
            return false;
        }
        String contentType = getContentType(exception);
        return isGameType(contentType) || isRingType(contentType);
    }

    /**
     * Extracts the content type that follows {@link #_INVALID_TYPE} in an
     * exception message.
     *
     * @param exception exception message, may be null
     * @return the text after the marker, the unchanged message when the
     *         marker is absent, or null for a null message
     */
    private static String getContentType(String exception) {
        if (exception == null) {
            return null;
        }
        int pos = exception.indexOf(_INVALID_TYPE);
        if (pos < 0) {
            return exception;
        }
        // Skip the whole marker, not just its first character.
        return exception.substring(pos + _INVALID_TYPE.length());
    }

    /**
     * Appends a "url TAB contentType" line to the given file, creating the
     * parent directories when needed. Failures are logged, not thrown.
     *
     * @param url         URL of the ring-tone or game download page
     * @param contentType content type reported for that URL
     * @param fileName    file to append to
     */
    private void saveMediaUrl(String url, String contentType, String fileName) {
        String log = url + "\t" + contentType;
        PrintWriter pw = null;
        try {
            // A bare file name has no parent directory to create.
            File parent = new File(fileName).getParentFile();
            if (parent != null) {
                parent.mkdirs();
            }
            pw = new PrintWriter(new OutputStreamWriter(
                    new FileOutputStream(fileName, true)));
            pw.println(log);
            // PrintWriter swallows write errors; surface them explicitly.
            if (pw.checkError()) {
                _logClass.warn("IO Exception writing to " + fileName);
            }
        } catch (IOException ioe) {
            _logClass.warn("IO Exception writing to " + fileName, ioe);
        } finally {
            if (pw != null) {
                pw.close();
            }
        }
    }

    /** Returns true when {@code text} is non-null and contains any of the fragments. */
    private static boolean containsAny(String text, String[] fragments) {
        if (text == null) {
            return false;
        }
        for (int i = 0; i < fragments.length; i++) {
            if (text.indexOf(fragments[i]) >= 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether the content type belongs to a ring-tone download.
     *
     * @param contentType content type, may be null
     */
    private static boolean isRingType(String contentType) {
        return containsAny(contentType, _RING_TYPES);
    }

    /**
     * Tells whether the content type belongs to a game download.
     *
     * @param contentType content type, may be null
     */
    private static boolean isGameType(String contentType) {
        return containsAny(contentType, _GAME_TYPES);
    }

    public String getMobileNo() {
        return _mobile_no;
    }

    public void setMobileNo(String _mobile_no) {
        this._mobile_no = _mobile_no;
    }

    public String getUserAgent() {
        return _user_agent;
    }

    public void setUserAgent(String _user_agent) {
        this._user_agent = _user_agent;
    }

    public String getReferer() {
        return _referer;
    }

    public void setReferer(String _referer) {
        this._referer = _referer;
    }

    public String getProxyHost() {
        return _proxy_host;
    }

    public void setProxyHost(String _proxy_host) {
        this._proxy_host = _proxy_host;
    }

    public String getProxyPort() {
        return _proxy_port;
    }

    public void setProxyPort(String _proxy_port) {
        this._proxy_port = _proxy_port;
    }

    /** Returns the URL recorded for the most recent download. */
    public String getEndUrl() {
        return _end_url;
    }

    public void setEndUrl(String _end_url) {
        this._end_url = _end_url;
    }

    /** Returns the content type recorded for the most recent download. */
    public String getContentType() {
        return _content_type;
    }

    public void setContentType(String _content_type) {
        this._content_type = _content_type;
    }
}
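// Code-viewer residue (keyboard shortcut help), not part of the program:
//   Copy code:        Ctrl + C
//   Search code:      Ctrl + F
//   Full screen:      F11
//   Switch theme:     Ctrl + Shift + D
//   Show shortcuts:   ?
//   Larger font:      Ctrl + =
//   Smaller font:     Ctrl + -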