📄 urltodownload.java

📁 是个java写的spider,非常不错!能承受很大的压力,每天采集的数量在10000万
💻 JAVA
字号:
/*
 * This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
 *
 * Copyright (c) 2001 Brian Pitcher
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

// $Header: /CVSRepository/spider/cn/yicha/subject/spider/URLToDownload.java,v 1.3 2006/02/16 06:57:00 zhangdi Exp $

package cn.yicha.subject.spider;

import java.util.Set;
import java.net.URL;
import java.net.MalformedURLException;
import java.net.URLDecoder;

import org.apache.log4j.Category;

import cn.yicha.common.util.Log4j;
import cn.yicha.common.util.StringParser;
import cn.yicha.subject.spdier.url.SpUrl;




/**
 * A URL queued for download by the spider, together with the bookkeeping that
 * travels with it: the referring URL, a depth counter, a service identifier,
 * an optional filter pattern and the URL finally reached.
 *
 * <p>Filter patterns (see {@link #isSatisfiedUrl()}) are a <code>;</code>-separated
 * list of segments. Inside a segment <code>*</code> is a wildcard, and a segment
 * that starts with <code>!</code> is an exclusion.
 *
 * <p>NOTE(review): the class is Serializable but declares no serialVersionUID,
 * so the JVM derives one from the class shape. Adding or removing non-private
 * members changes that value; confirm whether serialized instances are
 * persisted anywhere before doing so.
 */
public class URLToDownload implements java.io.Serializable
{
    /** URL to fetch; may be replaced by {@link #applyProperUrl()}. */
    private URL url;
    /** Referring URL; null when one of the convenience constructors is used. */
    private final URL referer;
    /** Depth value supplied by the caller (presumably the crawl depth -- confirm with callers). */
    private final int depth;
    /**
     * For Monternet search this is the SP service identifier; for free
     * web-page search it is the second-level domain.
     */
    private String serviceID;
    /** Flag supplied as <code>subsStatus</code> by the caller; defaults to true. */
    private boolean isBeforeSubs = true;
    /**
     * The original URL if it did not change, otherwise the URL after
     * redirection. Stays null until {@link #setEndURL(URL)} is called.
     */
    private URL endUrl;
    /** URL filter pattern string; null means "nothing matches". */
    private String filterPattern;

    private final static Category _logClass = Category.getInstance(URLToDownload.class);

    static
    {
        Log4j.init();
    }

    /**
     * Creates an entry with no referer and no filter pattern; the
     * <code>isBeforeSubs</code> flag is true.
     */
    public URLToDownload(URL url, int depth, String serviceID)
    {
        this(url, null, depth, serviceID, null, true);
    }

    /**
     * Creates an entry with no referer and no filter pattern, using the given
     * <code>subsStatus</code> as the <code>isBeforeSubs</code> flag.
     */
    public URLToDownload(URL url, int depth, String serviceID, boolean subsStatus)
    {
        this(url, null, depth, serviceID, null, subsStatus);
    }

    /**
     * Creates an entry with no referer and the given filter pattern; the
     * <code>isBeforeSubs</code> flag is true.
     */
    public URLToDownload(URL url, int depth, String serviceID, String filterPattern)
    {
        this(url, null, depth, serviceID, filterPattern, true);
    }

    /**
     * Creates a fully specified entry.
     *
     * @param url           URL to download
     * @param referer       referring URL, may be null
     * @param depth         depth value stored with the entry
     * @param serviceID     service identifier (see the field description)
     * @param filterPattern filter pattern string, may be null
     * @param subsStatus    initial value of the <code>isBeforeSubs</code> flag
     */
    public URLToDownload(URL url, URL referer, int depth, String serviceID, String filterPattern, boolean subsStatus)
    {
        this.url = url;
        this.referer = referer;
        this.depth = depth;
        this.serviceID = serviceID;
        this.filterPattern = filterPattern;
        this.isBeforeSubs = subsStatus;
    }

    public URL getURL()
    {
        return url;
    }

    public URL getReferer()
    {
        return referer;
    }

    public URL getEndURL()
    {
        return endUrl;
    }

    public void setEndURL(URL endUrl)
    {
        this.endUrl = endUrl;
    }

    public int getDepth()
    {
        return depth;
    }

    public String getServiceID()
    {
        return serviceID;
    }

    public void setServiceID(String serviceID)
    {
        this.serviceID = serviceID;
    }

    public boolean getIsBeforeSubs()
    {
        return isBeforeSubs;
    }

    public void setIsBeforeSubs(boolean isBeforeSubs)
    {
        this.isBeforeSubs = isBeforeSubs;
    }

    public String getFilterPattern()
    {
        return filterPattern;
    }

    public void setFilterPattern(String filterPattern)
    {
        this.filterPattern = filterPattern;
    }

    /**
     * Returns the URL, referer and depth as a single human-readable line.
     */
    public String toString()
    {
        return url + ", referer " + referer + ", depth " + depth;
    }

    /**
     * Returns true when the lower-cased external form of the URL contains
     * ".htm" at any position other than the very start. The check is a plain
     * substring test, so ".htm" in the host or query string also counts.
     */
    public boolean isHtml()
    {
        String str = url.toExternalForm().toLowerCase();
        return str.indexOf(".htm") > 0;
    }

    /**
     * Tests whether <code>url</code> contains the literal pieces of
     * <code>pattern</code> in order.
     *
     * <p>The pattern is split on <code>*</code>; each piece must occur in the
     * URL after the end of the previous piece's match. The match is not
     * anchored: text before the first piece and after the last one is ignored.
     * For example the pattern
     * <code>www.*yes.com/*)/game/a.jsp?lxt=*&amp;id=</code> matches
     * <code>http://www.joyes.com/(aadsfdafa)/game/a.jsp?lxt=ads&amp;id=888</code>.
     *
     * @param pattern wildcard pattern; null or empty never matches
     * @param url     text to test; null never matches
     * @return true if every piece of the pattern was found in order
     */
    public static boolean isMatchUrl(String pattern, String url) {
        // A missing pattern or URL cannot match (previously a null caused an NPE).
        if (pattern == null || url == null || pattern.length() <= 0) {
            return false;
        }

        String[] subPatterns = pattern.split("\\*");

        // Walk the URL left to right; 'cursor' is the first index not yet
        // consumed by an earlier piece.
        int cursor = 0;
        for (int i = 0; i < subPatterns.length; i++) {
            String piece = subPatterns[i];
            int pos = url.indexOf(piece, cursor);
            if (pos < 0) {
                return false;   // this piece does not occur in the remainder
            }
            cursor = pos + piece.length();
        }

        return true;
    }

    /**
     * Decides whether the URL matches the configured filter pattern.
     *
     * <p>The filter is a <code>;</code>-separated list of segments, compared
     * case-insensitively with {@link #isMatchUrl(String, String)}:
     * <ol>
     *   <li>a URL that matches any exclusion segment (prefixed with
     *       <code>!</code>) is rejected;</li>
     *   <li>a URL that matches at least one inclusion segment and no
     *       exclusion segment is accepted;</li>
     *   <li>everything else is rejected.</li>
     * </ol>
     * Empty segments (for example from <code>"a;;b"</code> or an empty filter)
     * are ignored.
     *
     * @return true if the URL passes the filter; false if it does not or if
     *         no filter pattern is set
     */
    public boolean isSatisfiedUrl()
    {
        final char _NOT_MATCH_PREFIX = '!';

        // With no filter at all, nothing matches.
        String filter = getFilterPattern();
        if (filter == null) {
            return false;
        }

        String urlAddr = url.toExternalForm().toLowerCase();
        String[] patterns = filter.split(";");

        boolean haveMatched = false;
        for (int i = 0; i < patterns.length; i++)
        {
            String pattern = patterns[i].toLowerCase();

            // An empty segment carries no rule; skipping it also keeps
            // charAt(0) below from throwing.
            if (pattern.length() == 0) {
                continue;
            }

            if (pattern.charAt(0) == _NOT_MATCH_PREFIX)
            {
                // Exclusion segment: a single hit rejects the URL outright.
                if (isMatchUrl(pattern.substring(1), urlAddr)) {
                    return false;
                }
            }
            else if (isMatchUrl(pattern, urlAddr))
            {
                // Inclusion segment: remember the hit, but keep scanning in
                // case a later exclusion segment vetoes the URL.
                haveMatched = true;
            }
        }

        return haveMatched;
    }

    /**
     * Decides whether the URL is a legal address that may be downloaded.
     *
     * <p>The URL must first pass {@link #isSatisfiedUrl()}. It is then rejected
     * if <code>StringParser.matchPattern</code> reports a match against a
     * built-in "need not visit" pattern, which is chosen by
     * <code>downloadMonternet</code>.
     *
     * @param downloadMonternet selects which built-in pattern is applied
     * @return true if the URL passes both checks
     */
    public boolean isValidUrl(boolean downloadMonternet)
    {
        // The configured filter pattern has to accept the URL first.
        if (!isSatisfiedUrl()) {
            return false;
        }

        // Then check the default patterns of addresses that need not be visited.
        // NOTE(review): the first pattern looks like a regular expression that
        // only hits the bare wap.monternet.com root; confirm against
        // StringParser.matchPattern.
        String pattern;
        if (downloadMonternet) {
            pattern = "wap.monternet.com/*\\s*$";
        }
        else {
            pattern = "wap.monternet.com";
        }

        return !StringParser.matchPattern(getURL().toExternalForm(), pattern);
    }

    /**
     * Decides whether the URL is a legal address that may be downloaded,
     * using only the configured filter pattern.
     *
     * @return the result of {@link #isSatisfiedUrl()}
     */
    public boolean isValidUrl()
    {
        return isSatisfiedUrl();
    }

    /**
     * Currently accepts every anchor: the extension-based filtering below is
     * disabled, so the three arguments are not used.
     *
     * @return always true
     */
    public boolean isValidAnchor(Set ringExtensions, Set gameExtensions, Set invalidExtensions)
    {
        /*String urlAddr = url.toExternalForm();

        // Drop URLs whose suffix is not a multimedia format but which contain a multimedia format string
        String[] invalidSuffix = (String[]) invalidExtensions.toArray(new String[0]);
        for (int m=0; m < invalidSuffix.length; m++)
        {
            String suffix = "." + invalidSuffix[m];
            if (urlAddr.toLowerCase().indexOf(suffix) >= 0) {
                _logClass.info("invalid anchor --> " + urlAddr);
                return false;
            }
        }

        String urlSuffix = getUrlSuffix(urlAddr.toLowerCase());

        // Check game suffixes
        String[] extensions = (String[]) gameExtensions.toArray(new String[0]);
        for (int i=0; i < extensions.length; i++) {
            String suffix = "." + extensions[i];
            if (urlSuffix.indexOf(suffix.toLowerCase()) >= 0) {
                _logClass.info("[game anchor] --> " + urlAddr);
                return false;
            }
        }

        // Check ringtone suffixes
        extensions = (String[]) ringExtensions.toArray(new String[0]);
        for (int i=0; i < extensions.length; i++) {
            String suffix = "." + extensions[i];
            if (urlSuffix.indexOf(suffix.toLowerCase()) >= 0) {
                _logClass.info("[ring anchor] --> " + urlAddr);
                return false;
            }
        }*/

        return true;
    }

    /**
     * Extracts the suffix (file extension) of the target URL.
     *
     * <p>Only referenced from the disabled code in
     * {@link #isValidAnchor(Set, Set, Set)}.
     *
     * <p>NOTE(review): an extracted suffix includes the leading dot, while the
     * default "wml" does not; confirm which form callers expect before
     * re-enabling the anchor filter.
     *
     * @param urlAbs absolute URL as a string
     * @return the text from the last "." of the last path segment (without any
     *         "#..." tail), or "wml" when the URL has no "/" after the
     *         "http://" prefix or the last segment has no "."
     */
    private String getUrlSuffix(String urlAbs)
    {
        final String _DEFAULT_SUFFIX = "wml";
        final String _HTTP_PREFIX = "http://";

        String suffix = _DEFAULT_SUFFIX;

        try {
            // Name the charset explicitly: the one-argument overload is
            // deprecated because its result depends on the platform default.
            urlAbs = URLDecoder.decode(urlAbs, "UTF-8");
        }
        catch (Exception ex) {
            // Best effort: carry on with the undecoded URL.
            _logClass.info("exception url --> " + urlAbs, ex);
        }

        // Strip a trailing "/".
        if (urlAbs.endsWith("/")) {
            urlAbs = urlAbs.substring(0, urlAbs.length() - 1);
        }

        // Keep only the part of the URL before "?" or ";".
        int pos1 = urlAbs.indexOf("?");
        if (pos1 >= 0) {
            urlAbs = urlAbs.substring(0, pos1);
        }

        pos1 = urlAbs.indexOf(";");
        if (pos1 >= 0) {
            urlAbs = urlAbs.substring(0, pos1);
        }

        // Drop the "http://" prefix so its slashes are not taken for a path.
        if (urlAbs.startsWith(_HTTP_PREFIX)) {
            urlAbs = urlAbs.substring(_HTTP_PREFIX.length());
        }

        // Keep the last path segment; without any "/" fall back to the default.
        int pos2 = urlAbs.lastIndexOf("/");
        if (pos2 < 0) {
            return suffix;
        }
        urlAbs = urlAbs.substring(pos2);

        int dotPos = urlAbs.lastIndexOf(".");
        if (dotPos >= 0) {
            suffix = urlAbs.substring(dotPos);
        }

        // Cut off "#" and everything after it.
        pos2 = suffix.indexOf("#");
        if (pos2 >= 0) {
            suffix = suffix.substring(0, pos2);
        }

        return suffix;
    }

    /**
     * If the URL is a Monternet address, replaces it with the address returned
     * by <code>SpUrl.transSpUrl</code> (per the original author, one that can
     * be crawled from the Internet). When the translated string is not a valid
     * URL the error is logged and the current URL is kept.
     */
    public void applyProperUrl()
    {
        String properUrl = getURL().toExternalForm();
        _logClass.debug("apply proper url --> " + properUrl);

        if (!SpUrl.isMonternetUrl(properUrl)) {
            return;
        }

        properUrl = SpUrl.transSpUrl(properUrl);
        _logClass.info("trans url --> " + properUrl);

        try {
            this.url = new URL(properUrl);
        }
        catch (MalformedURLException ex) {
            // Keep the previous URL, but record the failure in the log
            // instead of only dumping a stack trace to stderr.
            _logClass.error("malformed trans url --> " + properUrl, ex);
        }
    }

    /**
     * Extracts the service ID embedded in the URL.
     *
     * @return the value of <code>SpUrl.getServiceIDFromUrl</code> when the URL
     *         is a Monternet address, otherwise null (not an empty string)
     */
    public String fetchServiceIDFromUrl()
    {
        String sourceUrl = getURL().toExternalForm();
        if (SpUrl.isMonternetUrl(sourceUrl)) {
            return SpUrl.getServiceIDFromUrl(sourceUrl);
        }

        return null;
    }
}
💿 文件大小 53 K
👤 上传用户 god_dog
📂 所属分类 Java编程
🏷️ 相关标签

#sipder #10000 #java #压力
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -