urlobject.java

来自「是个java写的spider,非常不错!能承受很大的压力,每天采集的数量在100」· Java 代码 · 共 402 行
JAVA
402 行
/*
 * This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
 *
 * Copyright (c) 2001 Brian Pitcher
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

// $Header: /CVSRepository/spider/cn/yicha/subject/spider/URLObject.java,v 1.2 2006/02/16 04:35:26 zhangdi Exp $

package cn.yicha.subject.spider;

import org.apache.log4j.Category;

import cn.yicha.common.util.Log4j;
import cn.yicha.common.util.StringUtil;
import cn.yicha.subject.spider.store.ExtractAnchor;



import java.io.*;
import java.net.URL;
import java.net.URLEncoder;


/**
 * A downloaded (or previously stored) resource identified by its source URL,
 * together with its content type, raw bytes and the outcome of the fetch.
 *
 * <p>Besides carrying the data, the class maps the URL to a file name under the
 * configured save directory and can write the content to, or read it back
 * from, that file.</p>
 *
 * <p>Instances created with {@link #URLObject(URL, int)} carry only an error
 * code: their content is {@code null} and they have no configuration, so the
 * file-related methods must not be called on them.</p>
 */
public class URLObject
{
    private final static Category _logClass = Category.getInstance(URLObject.class);
    static {
        Log4j.init();
    }

    // Result codes describing the outcome of fetching a URL.
    public static final int _SUCCEED = 0;
    public static final int _CONNECT_TIMEOUT_EXCEPTION = 1;
    public static final int _FILE_NOT_FOUND_EXCEPTION = 2;
    public static final int _OTHER_EXCEPTION = 3;

    // Directory names used to separate pages fetched before / after subscription.
    private static final String _BEFORE_SUBS = "beforeSubs";
    private static final String _AFTER_SUBS = "afterSubs";

    // Characters of the URL that convertToFileName() replaces by their %XX form.
    private static final String[] URL_ESCAPED_CHARS = {"?", "&", ":", "<", ">", "|"};

    // Characters that convertToFileName() removes from the URL altogether.
    private static final String[] URL_REMOVED_CHARS = {"*"};

    // Characters escaped inside the query part of a file name just before writing.
    private static final String[] QUERY_ESCAPED_CHARS = {"/", "\\", ":", "*", "?", "&", "<", ">", "|"};

    // Longest accepted (escaped) query part of a file name, in characters.
    private static final int MAX_QUERY_PART_LENGTH = 128;

    private URL sourceURL = null;
    private String contentType = "";
    private byte[] content = null;

    private String serviceID = "";
    private int errorType = _SUCCEED;
    private boolean isBeforeSubs = true;

    private SpiderConfig config = null;

    /**
     * Creates an object from data that has just been downloaded.
     *
     * @param sourceURL   URL the content was fetched from
     * @param contentType content type reported for the resource
     * @param content     raw bytes of the resource
     * @param config      spider configuration used to derive storage paths
     * @param serviceID   service identifier, used as a directory name by {@link #getFilePath()}
     * @param subsStatus  {@code true} if the page was fetched before subscription
     */
    public URLObject(URL sourceURL, String contentType, byte[] content, SpiderConfig config, String serviceID, boolean subsStatus)
    {
        this.sourceURL = sourceURL;
        this.contentType = contentType;
        this.content = content;
        this.config = config;

        this.serviceID = serviceID;
        this.isBeforeSubs = subsStatus;
    }

    /**
     * Creates an object for a URL whose content may already be stored on disk.
     * The content type is guessed from the URL text; the content is loaded from
     * the stored file when it exists and is empty otherwise (including when the
     * file cannot be read).
     *
     * @param sourceURL  URL of the resource
     * @param config     spider configuration used to derive storage paths
     * @param serviceID  service identifier, used as a directory name by {@link #getFilePath()}
     * @param subsStatus {@code true} if the page was fetched before subscription
     */
    public URLObject(URL sourceURL, SpiderConfig config, String serviceID, boolean subsStatus)
    {
        this.sourceURL = sourceURL;
        this.config = config;
        this.serviceID = serviceID;
        this.isBeforeSubs = subsStatus;

        this.contentType = guessContentType(sourceURL);
        this.content = loadStoredContent();
    }

    /**
     * Creates a placeholder that carries only an error code; no page content
     * or configuration is attached.
     *
     * @param sourceURL URL whose download failed
     * @param errorType one of the {@code _*_EXCEPTION} result codes
     */
    public URLObject(URL sourceURL, int errorType)
    {
        this.sourceURL = sourceURL;
        this.errorType = errorType;
    }

    /**
     * Guesses a content type from the textual form of the URL: anything that
     * contains ".jpg" or ".gif" is an image, everything else is HTML.
     */
    private static String guessContentType(URL url)
    {
        // A fixed locale keeps the match independent of the JVM default
        // (e.g. the Turkish dotless-i lower-casing of "GIF").
        String s = url.toExternalForm().toLowerCase(java.util.Locale.ENGLISH);
        if (s.indexOf(".jpg") != -1) {
            return "image/jpeg";
        }
        if (s.indexOf(".gif") != -1) {
            return "image/gif";
        }
        return "text/html";
    }

    /**
     * Reads the stored copy of this URL, or returns an empty array when there
     * is none or it cannot be read.
     */
    private byte[] loadStoredContent()
    {
        if (!existsOnDisk()) {
            return new byte[0];
        }

        // existsOnDisk() has already ruled out directories, so the file can be read directly.
        File f = new File(storedFileName());
        try {
            return readFully(f);
        } catch (IOException ioe) {
            _logClass.warn("IO Exception reading disk version of URL " + sourceURL, ioe);
            return new byte[0];
        }
    }

    /**
     * Reads a whole file into memory, closing the stream in every case.
     */
    private static byte[] readFully(File f) throws IOException
    {
        long length = f.length();
        if (length > Integer.MAX_VALUE) {
            throw new IOException("file too large to load: " + f);
        }

        byte[] data = new byte[(int) length];
        DataInputStream in = new DataInputStream(new FileInputStream(f));
        try {
            // A plain read() may return fewer bytes than requested; readFully() does not.
            in.readFully(data);
        } finally {
            in.close();
        }
        return data;
    }

    /**
     * @return the content type given at construction or guessed from the URL; may be {@code null}
     */
    public String getContentType()
    {
        return contentType;
    }

    /**
     * Case-insensitive, null-safe test of the content type against a lower-case prefix.
     */
    private boolean contentTypeStartsWith(String prefix)
    {
        return contentType != null
            && contentType.regionMatches(true, 0, prefix, 0, prefix.length());
    }

    /** @return {@code true} if the content type starts with "text/html" */
    public boolean isHTML()
    {
        return contentTypeStartsWith("text/html");
    }

    /** @return {@code true} if the content type starts with "text/xml" */
    public boolean isXML()
    {
        return contentTypeStartsWith("text/xml");
    }

    /** @return {@code true} if the content type starts with "text/vnd.wap.wml" */
    public boolean isWML()
    {
        return contentTypeStartsWith("text/vnd.wap.wml");
    }

    /** @return {@code true} if the content type starts with "application/vnd.wap" */
    public boolean isXHTMLVer1()
    {
        return contentTypeStartsWith("application/vnd.wap");
    }

    /**
     * Decides from the downloaded page itself whether it is XHTML: the first
     * {@code xmlns} attribute extracted from the {@code html} tag must contain "xhtml".
     *
     * @return {@code false} when there is no content or no such attribute
     */
    public boolean isXHTMLVer2()
    {
        if (content == null) {
            return false;
        }

        String source = new String(content);
        String[] xmlns = ExtractAnchor.extractAttributesFromTag("html", "xmlns", source);
        return xmlns != null
            && xmlns.length > 0
            && xmlns[0].indexOf("xhtml") >= 0;
    }

    /**
     * Tells whether this object holds a document type the spider handles
     * (WML, XML, HTML or XHTML). Unsupported content types are logged.
     *
     * @return {@code false} if the content type is {@code null} or unsupported
     */
    public boolean isValidDoc()
    {
        String type = getContentType();
        if (type == null) {
            return false;
        }

        boolean ret = (isWML() || isXML() || isHTML() || isXHTMLVer1() || isXHTMLVer2());
        if (!ret) {
            _logClass.info("invalid content type --> " + type);
        }

        return ret;
    }

    /** @return {@code true} if the content type starts with "image/" */
    public boolean isImage()
    {
        return contentTypeStartsWith("image/");
    }

    /**
     * Returns the content decoded with the platform default charset.
     *
     * @return the decoded content, or an empty string when this object has no
     *         content (for example an error-only object)
     */
    public String getStringContent()
    {
        if (content == null) {
            // Error-only objects have no content; report that as empty text
            // instead of terminating the JVM.
            return "";
        }
        return new String(content);
    }

    /**
     * Returns the directory under which this object is stored. When
     * {@code config.downloadMonternet()} is true, the service ID and a
     * before/after-subscription directory are appended to the save root so
     * the two states can be analysed separately.
     */
    public String getFilePath()
    {
        String root = config.getSaveRootDirectory().getPath();
        if (!config.downloadMonternet()) {
            return root;
        }

        String subs = getIsBeforeSubs() ? _BEFORE_SUBS : _AFTER_SUBS;
        return root + "/" + getServiceID() + "/" + subs;
    }

    /**
     * Converts the source URL into a file name below {@link #getFilePath()}.
     *
     * <p>A leading "http://" is dropped, a bare host name gets a trailing "/",
     * a trailing "/" gets "index.wml" appended, the characters
     * {@code ? & : < > |} are replaced by their %XX form and {@code *} is
     * removed.</p>
     */
    public String convertToFileName()
    {
        String url = sourceURL.toExternalForm();

        // Drop the HTTP prefix.
        if (url.startsWith("http://")) {
            url = url.substring(7);
        }

        // Check for at least one slash -- otherwise host name (e.g. sourceforge.net)
        if (url.indexOf('/') < 0) {
            url = url + "/";
        }

        // If trailing slash, add index.wml as default
        if (url.endsWith("/")) {
            url = url + "index.wml";
        }

        // Escape characters that are special in file names.
        for (int i = 0; i < URL_ESCAPED_CHARS.length; i++) {
            url = StringUtil.textReplace(URL_ESCAPED_CHARS[i], encodeForFileName(URL_ESCAPED_CHARS[i]), url);
        }

        // Remove characters that must not appear in file names at all.
        for (int j = 0; j < URL_REMOVED_CHARS.length; j++) {
            url = StringUtil.textReplace(URL_REMOVED_CHARS[j], "", url);
        }

        return getFilePath() + "/" + url;
    }

    /**
     * Returns the %XX form of a special character for use in a file name.
     * {@code URLEncoder} leaves "*" untouched, so it is escaped explicitly.
     */
    private static String encodeForFileName(String s)
    {
        if ("*".equals(s)) {
            return "%2A";
        }
        try {
            return URLEncoder.encode(s, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            // Every Java platform is required to support UTF-8.
            throw new IllegalStateException("UTF-8 encoding is not available", e);
        }
    }

    /**
     * Returns the index at which the query part of a file name starts, i.e.
     * the first "%3F" or "?", or -1 if there is none.
     */
    private static int queryStart(String fileName)
    {
        int encoded = fileName.indexOf("%3F");
        int raw = fileName.indexOf('?');
        if (encoded < 0) {
            return raw;
        }
        if (raw < 0) {
            return encoded;
        }
        return Math.min(encoded, raw);
    }

    /**
     * Splits a file name into the part before the query and the query part,
     * with the special characters of the query part escaped.
     *
     * @return a two-element array: {@code [0]} the untouched root path,
     *         {@code [1]} the escaped query part (empty if there is none)
     */
    private static String[] splitStoredName(String fileName)
    {
        int split = queryStart(fileName);
        if (split < 0) {
            return new String[] { fileName, "" };
        }

        String rootPath = fileName.substring(0, split);
        String subPath = fileName.substring(split);
        for (int i = 0; i < QUERY_ESCAPED_CHARS.length; i++) {
            subPath = StringUtil.textReplace(QUERY_ESCAPED_CHARS[i],
                    encodeForFileName(QUERY_ESCAPED_CHARS[i]),
                    subPath);
        }
        return new String[] { rootPath, subPath };
    }

    /**
     * Returns the name under which {@link #writeToFile()} stores this object,
     * so that lookups use exactly the name that was written.
     */
    private String storedFileName()
    {
        String[] parts = splitStoredName(convertToFileName());
        return parts[0].concat(parts[1]);
    }

    /**
     * @return {@code true} if the source URL is contained in {@code config.getHsPreLinks()}
     */
    public boolean existsOnPrevLog()
    {
        String url = sourceURL.toExternalForm();
        return config.getHsPreLinks().contains(url);
    }

    /**
     * @return {@code true} if the file this object would be stored in exists
     *         and is not a directory
     */
    public boolean existsOnDisk()
    {
        File f = new File(storedFileName());
        return (f.exists() && !f.isDirectory());
    }

    /**
     * Writes the content to the file derived from the source URL.
     */
    public void writeToFile()
    {
        writeToFile(convertToFileName());
    }

    /**
     * Writes the content to the given file, creating parent directories as needed.
     *
     * <p>The part of the name starting at the first "?" or "%3F" has its
     * special characters escaped first. If that escaped part is longer than
     * 128 characters nothing is written and the error type is set to
     * {@link #_OTHER_EXCEPTION}. I/O failures are logged and otherwise ignored.</p>
     *
     * @param fileName target file name, typically from {@link #convertToFileName()}
     */
    public void writeToFile(String fileName)
    {
        String[] parts = splitStoredName(fileName);
        if (parts[1].length() > MAX_QUERY_PART_LENGTH) {
            _logClass.info("too long path...");
            errorType = URLObject._OTHER_EXCEPTION;
            return;
        }

        fileName = parts[0].concat(parts[1]);
        _logClass.info("writeToFile(" + fileName + ")");

        if (content == null) {
            // Error-only objects have nothing to store.
            _logClass.warn("no content to write to " + fileName);
            return;
        }

        try {
            File f = new File(fileName);
            File pf = f.getParentFile();
            if (pf != null) {
                pf.mkdirs();
            }

            // FileOutputStream creates the file itself; close it even when the write fails.
            FileOutputStream out = new FileOutputStream(f);
            try {
                out.write(content);
                out.flush();
            } finally {
                out.close();
            }
        } catch (IOException ioe) {
            _logClass.warn("IO Exception writing to " + fileName, ioe);
        }
    }

    /**
     * @return "URLObject: " followed by the content type
     */
    public String toString()
    {
        return "URLObject: " + contentType;
    }

    /** @return the URL this object was created for */
    public URL getSourceURL()
    {
        return sourceURL;
    }

    /** @return the service identifier given at construction */
    public String getServiceID()
    {
        return serviceID;
    }

    /** @param errorType one of the result codes defined by this class */
    public void setErrorType(int errorType)
    {
        this.errorType = errorType;
    }

    /** @return the current result code; {@link #_SUCCEED} unless an error was recorded */
    public int getErrorType()
    {
        return errorType;
    }

    /**
     * @return {@code true} if no error has been recorded for this object
     */
    public boolean isValidObj()
    {
        return getErrorType() == _SUCCEED;
    }

    /** @return {@code true} if the page was fetched before subscription */
    public boolean getIsBeforeSubs()
    {
        return isBeforeSubs;
    }

    /** @param isBeforeSubs {@code true} if the page was fetched before subscription */
    public void setIsBeforeSubs(boolean isBeforeSubs)
    {
        this.isBeforeSubs = isBeforeSubs;
    }

    /**
     * Manual check: prints a sample URL with its special characters escaped.
     */
    public static void main(String[] args)
    {
        String url = "http://cmbw.5200.cn/cp/preI.jsp?gpid=401&amp;PT=ODYxMzU5MDU5MjYwNA**";

        // Escape characters that are special in file names.
        String[] tagList = {"?", "&", ":", "<", ">", "|", "*"};
        for (int i = 0; i < tagList.length; i++) {
            url = StringUtil.replace(url, tagList[i], encodeForFileName(tagList[i]));
        }

        System.out.println(url);
    }
}
urlobject.java - 源码说明

本页面展示了「是个java写的spider,非常不错!能承受很大的压力,每天采集的数量在10000万」中的 urlobject.java 源码文件，采用 Java 编程语言编写，共 402 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与spider相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?