// File: urlobject.java
// (Code-viewer residue, not part of the program: "Font size:" control label.)
/*
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
*
* Copyright (c) 2001 Brian Pitcher
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
// $Header: /CVSRepository/spider/cn/yicha/subject/spider/URLObject.java,v 1.2 2006/02/16 04:35:26 zhangdi Exp $
package cn.yicha.subject.spider;
import org.apache.log4j.Category;
import cn.yicha.common.util.Log4j;
import cn.yicha.common.util.StringUtil;
import cn.yicha.subject.spider.store.ExtractAnchor;
import java.io.*;
import java.net.URL;
import java.net.URLEncoder;
/**
 * A resource addressed by a URL together with its content type and raw bytes,
 * plus the logic that maps the URL to a file name below the spider's save
 * directory and reads/writes that file.
 *
 * <p>An instance may also be a pure error marker (see
 * {@link #URLObject(URL, int)}): such an instance carries only the URL and an
 * error code; it has no content and no configuration, so the file-related
 * methods ({@link #getFilePath()}, {@link #convertToFileName()},
 * {@link #existsOnDisk()}, {@link #existsOnPrevLog()}, {@link #writeToFile()})
 * must not be called on it.
 */
public class URLObject
{
    private final static Category _logClass = Category.getInstance(URLObject.class);

    static {
        Log4j.init();
    }

    // Result flags reported for a URL fetch
    public static final int _SUCCEED = 0;
    public static final int _CONNECT_TIMEOUT_EXCEPTION = 1;
    public static final int _FILE_NOT_FOUND_EXCEPTION = 2;
    public static final int _OTHER_EXCEPTION = 3;

    // Directory names that separate pages stored before / after subscription
    private static final String _BEFORE_SUBS = "beforeSubs";
    private static final String _AFTER_SUBS = "afterSubs";

    private URL sourceURL = null;
    private String contentType = "";
    private byte[] content = null;
    private String serviceID = "";
    private int errorType = _SUCCEED;
    private boolean isBeforeSubs = true;
    private SpiderConfig config = null;

    /**
     * Creates an object from data the caller already holds.
     *
     * @param sourceURL   URL the content belongs to
     * @param contentType content type of the data
     * @param content     raw bytes of the resource
     * @param config      spider configuration used to derive storage paths
     * @param serviceID   service identifier, used as a directory component by {@link #getFilePath()}
     * @param subsStatus  {@code true} for the "before subscription" state, {@code false} for "after"
     */
    public URLObject(URL sourceURL, String contentType, byte[] content, SpiderConfig config, String serviceID, boolean subsStatus)
    {
        this.sourceURL = sourceURL;
        this.contentType = contentType;
        this.content = content;
        this.config = config;
        this.serviceID = serviceID;
        this.isBeforeSubs = subsStatus;
    }

    /**
     * Creates an object whose content is loaded from the file that
     * {@link #convertToFileName()} maps the URL to, if that file exists.
     * The content type is guessed from the URL text: {@code .jpg} gives
     * {@code image/jpeg}, {@code .gif} gives {@code image/gif}, anything else
     * {@code text/html}. When no file exists, or it cannot be read completely,
     * the content is an empty array.
     *
     * @param sourceURL  URL of the resource
     * @param config     spider configuration used to derive storage paths
     * @param serviceID  service identifier, used as a directory component by {@link #getFilePath()}
     * @param subsStatus {@code true} for the "before subscription" state, {@code false} for "after"
     */
    public URLObject(URL sourceURL, SpiderConfig config, String serviceID, boolean subsStatus)
    {
        this.sourceURL = sourceURL;
        this.config = config;
        this.serviceID = serviceID;
        this.isBeforeSubs = subsStatus;

        // Fixed locale: with a Turkish default locale "GIF" would lower-case to "gıf".
        String lowerUrl = sourceURL.toExternalForm().toLowerCase(java.util.Locale.ENGLISH);
        if (lowerUrl.indexOf(".jpg") != -1) {
            contentType = "image/jpeg";
        }
        else if (lowerUrl.indexOf(".gif") != -1) {
            contentType = "image/gif";
        }
        else {
            contentType = "text/html";
        }

        if (existsOnDisk()) {
            File f = new File(convertToFileName());
            if (f.isDirectory()) {
                f = new File(f, "index.html");
            }
            content = readFile(f);
        }
        else {
            content = new byte[0];
        }
    }

    /**
     * Creates an error marker: not a real page object, it only carries the
     * URL and the error code to report back.
     *
     * @param sourceURL URL the error refers to
     * @param errorType one of the {@code _*} result flags of this class
     */
    public URLObject(URL sourceURL, int errorType)
    {
        this.sourceURL = sourceURL;
        this.errorType = errorType;
    }

    /**
     * Reads a whole file into memory.
     *
     * @return the file's bytes, or an empty array if reading fails (the failure is logged)
     */
    private byte[] readFile(File f)
    {
        byte[] data = new byte[(int) f.length()];
        DataInputStream in = null;
        try {
            in = new DataInputStream(new FileInputStream(f));
            // A single read() may return fewer bytes than requested; readFully loops until done.
            in.readFully(data);
            return data;
        }
        catch (IOException ioe) {
            _logClass.warn("IO Exception reading disk version of URL " + sourceURL, ioe);
            // Do not hand out a partially filled, zero-padded buffer as if it were content.
            return new byte[0];
        }
        finally {
            if (in != null) {
                try {
                    in.close();
                }
                catch (IOException ioe) {
                    _logClass.warn("IO Exception reading disk version of URL " + sourceURL, ioe);
                }
            }
        }
    }

    /**
     * URL-encodes a string as UTF-8 (the one-argument
     * {@code URLEncoder.encode} is deprecated because it depends on the
     * platform default encoding).
     */
    private static String encode(String s)
    {
        try {
            return URLEncoder.encode(s, "UTF-8");
        }
        catch (UnsupportedEncodingException e) {
            // Every Java platform is required to support UTF-8.
            throw new IllegalStateException("UTF-8 encoding is not supported", e);
        }
    }

    /**
     * Null-safe, case-insensitive test of the content type against a
     * lower-case prefix.
     */
    private boolean hasContentTypePrefix(String lowerCasePrefix)
    {
        return contentType != null
            && contentType.toLowerCase(java.util.Locale.ENGLISH).startsWith(lowerCasePrefix);
    }

    /** @return the content type given at construction or guessed from the URL; may be {@code null} */
    public String getContentType()
    {
        return contentType;
    }

    /** @return {@code true} if the content type starts with {@code text/html} */
    public boolean isHTML()
    {
        return hasContentTypePrefix("text/html");
    }

    /** @return {@code true} if the content type starts with {@code text/xml} */
    public boolean isXML()
    {
        return hasContentTypePrefix("text/xml");
    }

    /** @return {@code true} if the content type starts with {@code text/vnd.wap.wml} */
    public boolean isWML()
    {
        return hasContentTypePrefix("text/vnd.wap.wml");
    }

    /** @return {@code true} if the content type starts with {@code application/vnd.wap} */
    public boolean isXHTMLVer1()
    {
        return hasContentTypePrefix("application/vnd.wap");
    }

    /**
     * Decides from the downloaded content whether the page is XHTML: the
     * first {@code xmlns} attribute extracted from the {@code html} tag must
     * contain {@code "xhtml"}.
     *
     * @return {@code false} when there is no content or no such attribute
     */
    public boolean isXHTMLVer2()
    {
        if (content == null) {
            return false;
        }
        String source = new String(content);
        String[] xmlns = ExtractAnchor.extractAttributesFromTag("html", "xmlns", source);
        return xmlns != null && xmlns.length > 0 && xmlns[0].indexOf("xhtml") >= 0;
    }

    /**
     * Tells whether this object is a regular document, i.e. WML, XML, HTML
     * or XHTML (by content type or by content). A rejected content type is
     * written to the log.
     *
     * @return {@code false} if the content type is {@code null} or matches none of the document kinds
     */
    public boolean isValidDoc()
    {
        String type = getContentType();
        if (type == null) {
            return false;
        }
        boolean valid = isWML() || isXML() || isHTML() || isXHTMLVer1() || isXHTMLVer2();
        if (!valid) {
            _logClass.info("invalid content type --> " + type);
        }
        return valid;
    }

    /** @return {@code true} if the content type starts with {@code image/} */
    public boolean isImage()
    {
        return hasContentTypePrefix("image/");
    }

    /**
     * Returns the content decoded with the platform default charset.
     *
     * @return the content as text, or an empty string when this object has
     *         no content (for example an error marker)
     */
    public String getStringContent()
    {
        // Previously a null content ended in System.exit(0); a missing body
        // must not terminate the whole JVM.
        if (content == null) {
            return "";
        }
        return new String(content);
    }

    /**
     * Returns the directory under which this object is stored. When
     * {@code config.downloadMonternet()} is true the path is
     * {@code <saveRoot>/<serviceID>/<beforeSubs|afterSubs>}, otherwise it is
     * the save root itself. Requires a non-null configuration.
     */
    public String getFilePath()
    {
        String root = config.getSaveRootDirectory().getPath();
        if (!config.downloadMonternet()) {
            return root;
        }
        // The before/after subscription state gets its own directory so the two can be analysed separately.
        String subs = getIsBeforeSubs() ? _BEFORE_SUBS : _AFTER_SUBS;
        return root + "/" + getServiceID() + "/" + subs;
    }

    /**
     * Converts the URL into a file name below {@link #getFilePath()}:
     * a leading {@code http://} is stripped, a bare host or a trailing slash
     * gets {@code index.wml} appended, the characters {@code ? & : < > |}
     * are URL-encoded and {@code *} is removed.
     */
    public String convertToFileName()
    {
        String url = sourceURL.toExternalForm();

        // Strip the HTTP prefix
        if (url.startsWith("http://")) {
            url = url.substring(7);
        }
        // Check for at least one slash -- otherwise host name (e.g. sourceforge.net)
        if (url.indexOf("/") < 0) {
            url = url + "/";
        }
        // If trailing slash, add index.wml as default
        if (url.endsWith("/")) {
            url = url + "index.wml";
        }

        // Encode characters that are special in file names
        String[] tagList = {"?", "&", ":", "<", ">", "|"};
        for (int i = 0; i < tagList.length; i++) {
            url = StringUtil.textReplace(tagList[i], encode(tagList[i]), url);
        }
        // Drop characters that cannot be kept at all
        String[] delList = {"*"};
        for (int j = 0; j < delList.length; j++) {
            url = StringUtil.textReplace(delList[j], "", url);
        }
        return getFilePath() + "/" + url;
    }

    /** @return {@code true} if the URL is contained in {@code config.getHsPreLinks()} */
    public boolean existsOnPrevLog()
    {
        String url = sourceURL.toExternalForm();
        return config.getHsPreLinks().contains(url);
    }

    /**
     * Tells whether a regular file exists at {@link #convertToFileName()}.
     *
     * <p>NOTE(review): {@link #writeToFile(String)} additionally encodes
     * {@code /}, {@code \} etc. in the part after the first {@code ?}/{@code %3F},
     * so for such URLs the written file name can differ from the one tested
     * here -- confirm whether that is intended.
     */
    public boolean existsOnDisk()
    {
        File f = new File(convertToFileName());
        return f.exists() && !f.isDirectory();
    }

    /** Writes the content to the file named by {@link #convertToFileName()}. */
    public void writeToFile()
    {
        writeToFile(convertToFileName());
    }

    /**
     * Writes the content to the given file, creating parent directories as
     * needed. Everything from the first {@code ?} or {@code %3F} onwards is
     * treated as a parameter string whose special characters are URL-encoded
     * first; if that part is longer than 128 characters nothing is written
     * and the error type becomes {@link #_OTHER_EXCEPTION}. I/O failures are
     * logged, not thrown.
     *
     * @param fileName target file name
     */
    public void writeToFile(String fileName)
    {
        // Limit 2 guarantees a first element even when the name consists only of a separator.
        String rootPath = fileName.split("%3F|\\?", 2)[0];
        String subPath = fileName.substring(rootPath.length());

        // Encode special characters of the parameter string before writing.
        // "*" is not listed: URLEncoder leaves it unchanged, so replacing it
        // with its encoding cannot alter the text.
        String[] tagList = {"/", "\\", ":", "?", "&", "<", ">", "|"};
        for (int i = 0; i < tagList.length; i++) {
            subPath = StringUtil.textReplace(tagList[i], encode(tagList[i]), subPath);
        }
        if (subPath.length() > 128) {
            _logClass.info("too long path...");
            errorType = URLObject._OTHER_EXCEPTION;
            return;
        }
        fileName = rootPath.concat(subPath);

        if (content == null) {
            // Nothing to write (e.g. an error marker); avoid leaving an empty file behind.
            _logClass.warn("No content to write to " + fileName);
            return;
        }

        _logClass.info("writeToFile(" + fileName + ")");
        FileOutputStream out = null;
        try {
            File parent = new File(fileName).getParentFile();
            if (parent != null) {
                parent.mkdirs();
            }
            out = new FileOutputStream(fileName);
            out.write(content);
            out.flush();
        }
        catch (IOException ioe) {
            _logClass.warn("IO Exception writing to " + fileName, ioe);
        }
        finally {
            // Close even when write() failed so the file handle is not leaked.
            if (out != null) {
                try {
                    out.close();
                }
                catch (IOException ioe) {
                    _logClass.warn("IO Exception writing to " + fileName, ioe);
                }
            }
        }
    }

    /** @return {@code "URLObject: "} followed by the content type */
    public String toString()
    {
        return "URLObject: " + contentType;
    }

    public URL getSourceURL()
    {
        return sourceURL;
    }

    public String getServiceID()
    {
        return serviceID;
    }

    public void setErrorType(int errorType)
    {
        this.errorType = errorType;
    }

    public int getErrorType()
    {
        return errorType;
    }

    /**
     * Tells whether this object represents a successful download, i.e. its
     * error type is {@link #_SUCCEED}.
     */
    public boolean isValidObj()
    {
        return getErrorType() == _SUCCEED;
    }

    public boolean getIsBeforeSubs()
    {
        return isBeforeSubs;
    }

    public void setIsBeforeSubs(boolean isBeforeSubs)
    {
        this.isBeforeSubs = isBeforeSubs;
    }

    /** Manual demo: prints a sample URL after encoding its special characters. */
    public static void main(String[] args)
    {
        String url = "http://cmbw.5200.cn/cp/preI.jsp?gpid=401&PT=ODYxMzU5MDU5MjYwNA**";
        // Encode characters that are special in file names
        String[] tagList = {"?", "&", ":", "<", ">", "|", "*"};
        for (int i = 0; i < tagList.length; i++) {
            url = StringUtil.replace(url, tagList[i], encode(tagList[i]));
        }
        System.out.println(url);
    }
}
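// Code-viewer residue (not part of the program) -- keyboard shortcut help:
//   Copy code:           Ctrl + C
//   Search code:         Ctrl + F
//   Full-screen mode:    F11
//   Toggle theme:        Ctrl + Shift + D
//   Show shortcuts:      ?
//   Increase font size:  Ctrl + =
//   Decrease font size:  Ctrl + -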