⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 spiderconfig.java

📁 是个Java写的spider,非常不错!能承受很大的压力,每天采集的数量在10000万
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
/*
 * This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
 *
 * Copyright (c) 2001 Brian Pitcher
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

// $Header: /CVSRepository/spider/cn/yicha/subject/spider/SpiderConfig.java,v 1.1 2005/12/01 02:10:19 zhangdi Exp $

package cn.yicha.subject.spider;


import java.io.*;
import java.util.*;
import java.net.URL;
import java.net.MalformedURLException;

import cn.yicha.common.util.Logger;
import cn.yicha.subject.spider.wapsite.*;


public class SpiderConfig extends Logger implements Serializable
{
	// Fallback values used by the Properties-based constructor when the
	// corresponding property is missing or is not a valid integer.
	private final static int _DEFAULT_MAX_DOWNLOAD_TIME = 5 * 3600;		// default maximum download time (maxDownloadTime is documented in seconds)
	private final static int _DEFAULT_MAX_DOWNLOAD_PAGES = 20000;		// default maximum number of pages to download
	private final static int _DEFAULT_MAX_TRY_COUNT = 1;				// default maximum number of download attempts

    private File saveRootDirectory;
    private File mailtoLogFile;

    private boolean refreshHTMLs;
    private boolean refreshImages;
    private boolean refreshOthers;
	
    private Set htmlExtensions;
    private Set imageExtensions;
	private Set ringExtensions;
	private Set gameExtensions;
	private Set invalidExtensions;			// extensions of links treated as invalid, e.g. images and ringtones, which are not downloaded

    private URL startLocation;
    private String urlMatch;

    private List interestingURLSubstrings;
    private List boringURLSubstrings;

    private boolean depthFirst;
    private int maxDepth;

    private String userAgent;

    private String basicAuthUser;
    private String basicAuthPassword;

    private String websiteFile;						// website configuration file
    private HashSet _websiteHash = null;				// list of websites to download (filled by parseWebsites)
    private File saveAnchorDirectory;				// directory where anchor attributes are stored
    private boolean downloadMonternet;				// flag selecting whether free sites or Monternet sites are downloaded
    private String saveRingFile;					// file where ringtone data is stored
    private String saveGameFile;					// file where game data is stored
	private boolean filterBySecondDomain;			// whether sites are filtered by second-level domain
	private boolean overrideLog;					// log overwrite flag
	private Set mobileNos;							// simulated mobile phone numbers
	private int maxTryCount;						// maximum number of connection attempts

	private int maxDownloadTime;			// download time limit, in seconds
	private int maxDownloadPages;			// limit on the number of pages downloaded

	private String proxyHost;				// proxy server address
	private String proxyPort;				// proxy server port
    private int spiderThreads;

    private long checkpointInterval;

	private int _mobile_no_index = 0;		// index of the mobile number handed out last (see getRandomMobileNo)
	
	// NOTE(review): not referenced in the visible part of the file; presumably a set of previously seen links - confirm.
	private HashSet _hsPreLinks = null;

    /**
     * Creates a configuration populated with built-in defaults.
     *
     * The defaults mirror the values the Properties-based constructor falls
     * back to when a property is missing. In particular the simulated mobile
     * numbers, the download limits, the retry count, the proxy settings and
     * the website/ring/game file names are initialised here, so that a
     * default-constructed config never exposes null collections or zero
     * limits (e.g. getRandomMobileNo() no longer fails on a null set).
     */
    public SpiderConfig()
    {
        // _logClass.debug("SpiderConfig()");

        saveRootDirectory = new File(".");
        saveAnchorDirectory = new File(".");
        mailtoLogFile = new File("mailto.txt");

        refreshHTMLs = true;
        refreshImages = false;
        refreshOthers = false;

        filterBySecondDomain = true;
        downloadMonternet = false;
        overrideLog = true;

        htmlExtensions = new HashSet();
        htmlExtensions.add("htm");
        htmlExtensions.add("html");
        htmlExtensions.add("shtml");

        imageExtensions = new HashSet();
        imageExtensions.add("jpg");
        imageExtensions.add("gif");
        imageExtensions.add("png");

        invalidExtensions = new HashSet();
        invalidExtensions.add("jpg");
        invalidExtensions.add("gif");
        invalidExtensions.add("png");

        ringExtensions = new HashSet();
        ringExtensions.add("amr");
        ringExtensions.add("mid");

        gameExtensions = new HashSet();
        gameExtensions.add("jar");
        gameExtensions.add("jad");
        gameExtensions.add("sis");

        // Same simulated mobile numbers the Properties-based constructor uses by default.
        mobileNos = new HashSet();
        mobileNos.add("13968873803");
        mobileNos.add("13950428251");

        urlMatch = null;
        interestingURLSubstrings = new ArrayList();
        boringURLSubstrings = new ArrayList();
        depthFirst = false;
        maxDepth = 0;

        // Download limits: use the declared defaults instead of leaving them at 0.
        maxTryCount = _DEFAULT_MAX_TRY_COUNT;
        maxDownloadTime = _DEFAULT_MAX_DOWNLOAD_TIME;
        maxDownloadPages = _DEFAULT_MAX_DOWNLOAD_PAGES;

        // No proxy by default (empty strings, as in the Properties-based constructor).
        proxyHost = "";
        proxyPort = "";

        // Alternative handset user agents that have been used with this spider:
        //   "MOT-E680/R51_G_0F.42.A1P MIB/2.2 Profile/MIDP-2.0 Configuration/CLDC-1.1"
        //   "Nokia6108/1.0 (05.04) Profile/MIDP-1.0 Configuration/CLDC-1.0"
        userAgent = "WebLech Spider 0.01alpha";
        basicAuthUser = "";
        basicAuthPassword = "";

        websiteFile = "config/website.conf";
        saveRingFile = "logs/ringlog.txt";
        saveGameFile = "logs/gamelog.txt";

        spiderThreads = 1;

        checkpointInterval = 0;
    }

    /**
     * Creates a configuration from a java.util.Properties object.
     *
     * Every setting has a default that is used when the property is absent.
     * Boolean and numeric values are trimmed before parsing, because
     * java.util.Properties keeps trailing whitespace in values: without the
     * trim, "true " would silently be read as false and "5 " would be
     * rejected as a number.
     *
     * Numeric properties that cannot be parsed are logged as errors and
     * replaced by a fallback value. If the save directories cannot be created
     * or are not directories, "." is used instead. A missing startLocation is
     * logged as a warning and a malformed one as an error; in both cases
     * startLocation stays null.
     *
     * @param props the properties to read the configuration from; must not be null
     */
    public SpiderConfig(Properties props)
    {
        _logClass.info("start to config spider");
        _logClass.debug("SpiderConfig(props)");

        // Directory where downloaded files are stored.
        saveRootDirectory = resolveSaveDirectory(props.getProperty("saveRootDirectory", "."), "root");

        // Directory where anchor attributes are stored.
        saveAnchorDirectory = resolveSaveDirectory(props.getProperty("saveAnchorDirectory", "."), "anchor");

        String mailtoFileStr = props.getProperty("mailtoLogFile", "mailto.txt");
        // Check if absolute or relative name given
        if(mailtoFileStr.indexOf(":") != -1 || mailtoFileStr.startsWith("/") || mailtoFileStr.startsWith("\\"))
        {
            _logClass.debug("Using absolute file name " + mailtoFileStr);
            mailtoLogFile = new File(mailtoFileStr);
        }
        else
        {
            _logClass.debug("Constructing relative file name " + saveRootDirectory.getPath() + "/" + mailtoFileStr);
            mailtoLogFile = new File(saveRootDirectory.getPath() + "/" + mailtoFileStr);
        }

        refreshHTMLs = readBooleanProperty(props, "refreshHTMLs", "true");
        refreshImages = readBooleanProperty(props, "refreshImages", "false");
        refreshOthers = readBooleanProperty(props, "refreshOthers", "false");

        // Flag selecting whether Monternet sites or free sites are downloaded.
        downloadMonternet = readBooleanProperty(props, "downloadMonternet", "false");

        // Whether sites are filtered by second-level domain (otherwise third-level).
        filterBySecondDomain = readBooleanProperty(props, "filterBySecondDomain", "true");

        // Log overwrite flag.
        overrideLog = readBooleanProperty(props, "overrideLog", "true");

        htmlExtensions = parseSet(props.getProperty("htmlExtensions", "htm,html,shtml"));
        imageExtensions = parseSet(props.getProperty("imageExtensions", "jpg,gif,png"));
        invalidExtensions = parseSet(props.getProperty("invalidExtensions", "jpg,gif,png"));
        ringExtensions = parseSet(props.getProperty("ringExtensions", "mid,amr"));
        gameExtensions = parseSet(props.getProperty("gameExtensions", "jar,jad,sis"));

        // Simulated mobile numbers used while crawling.
        mobileNos = parseSet(props.getProperty("mobileNos", "13968873803,13950428251"));

        String startLocStr = props.getProperty("startLocation");
        if(startLocStr != null)
        {
            try {
                startLocation = new URL(startLocStr);
            }
            catch(MalformedURLException murle) {
                _logClass.error("Caught MalformedURLException parsing start URL '" + startLocStr + "' : " + murle.getMessage(), murle);
            }
        }
        else
        {
            _logClass.warn("startLocation not found in properties");
        }

        urlMatch = props.getProperty("urlMatch");

        interestingURLSubstrings = parsePropCommaSeparated(props.getProperty("interestingURLs"));
        boringURLSubstrings = parsePropCommaSeparated(props.getProperty("boringURLs"));

        depthFirst = readBooleanProperty(props, "depthFirst", "false");

        // Note: a missing maxDepth defaults to 0, an unparsable one falls back to 1
        // (kept as in the original behaviour).
        maxDepth = readIntProperty(props, "maxDepth", 0, 1, "max depth");

        // Maximum download time.
        maxDownloadTime = readIntProperty(props, "maxDownloadTime",
                _DEFAULT_MAX_DOWNLOAD_TIME, _DEFAULT_MAX_DOWNLOAD_TIME, "max download time");

        // Maximum number of pages to download.
        maxDownloadPages = readIntProperty(props, "maxDownloadPages",
                _DEFAULT_MAX_DOWNLOAD_PAGES, _DEFAULT_MAX_DOWNLOAD_PAGES, "max download pages");

        // Maximum number of download attempts.
        maxTryCount = readIntProperty(props, "maxTryCount",
                _DEFAULT_MAX_TRY_COUNT, _DEFAULT_MAX_TRY_COUNT, "max download try count");

        // Proxy server address and port.
        proxyHost = props.getProperty("proxyHost", "");
        proxyPort = props.getProperty("proxyPort", "");

        userAgent = props.getProperty("userAgent", "WebLech Spider 0.01alpha");
        basicAuthUser = props.getProperty("basicAuthUser", "");
        basicAuthPassword = props.getProperty("basicAuthPassword", "");
        websiteFile = props.getProperty("websiteFile", "config/website.conf");
        saveRingFile = props.getProperty("saveRingFile", "logs/ringlog.txt");
        saveGameFile = props.getProperty("saveGameFile", "logs/gamelog.txt");

        spiderThreads = readIntProperty(props, "spiderThreads", 1, 1, "number of threads");

        try
        {
            String intervalStr = props.getProperty("checkpointInterval", "0");
            checkpointInterval = Long.parseLong(intervalStr.trim());
        }
        catch(NumberFormatException nfe)
        {
            _logClass.error("Caught number format exception parsing checkpoint interval, defaulting to 0", nfe);
            checkpointInterval = 0;
        }
    }

    /**
     * Returns the directory at the given path, creating it if necessary.
     * Falls back to "." (with an error log entry) when the directory cannot
     * be created or the path exists but is not a directory.
     *
     * @param path  configured directory path
     * @param label short name used in log messages ("root" or "anchor")
     */
    private File resolveSaveDirectory(String path, String label)
    {
        File dir = new File(path);
        if(!dir.exists())
        {
            if(!dir.mkdirs())
            {
                _logClass.error("Couldn't create " + label + " directory: " + dir);
                _logClass.info("Defaulting to . instead");
                dir = new File(".");
            }
        }
        else if(!dir.isDirectory())
        {
            _logClass.error("Save " + label + " is not a directory: " + dir);
            _logClass.info("Defaulting to . instead");
            dir = new File(".");
        }
        return dir;
    }

    /**
     * Reads a boolean property, ignoring surrounding whitespace in the value.
     * Anything other than "true" (case-insensitive) yields false.
     */
    private boolean readBooleanProperty(Properties props, String key, String defaultValue)
    {
        return Boolean.valueOf(props.getProperty(key, defaultValue).trim()).booleanValue();
    }

    /**
     * Reads an int property, ignoring surrounding whitespace in the value.
     *
     * @param defaultValue value used when the property is absent
     * @param fallback     value used (and logged) when the property is not a valid int
     * @param what         description of the setting for the error message
     */
    private int readIntProperty(Properties props, String key, int defaultValue, int fallback, String what)
    {
        try {
            return Integer.parseInt(props.getProperty(key, String.valueOf(defaultValue)).trim());
        }
        catch(NumberFormatException nfe) {
            _logClass.error("Caught number format exception parsing " + what + ", defaulting to " + fallback, nfe);
            return fallback;
        }
    }

    /**
     * Splits a comma-separated property value into its tokens.
     *
     * Tokens are returned exactly as they appear between the commas (no
     * trimming); empty tokens are skipped by the tokenizer.
     *
     * @param str the raw property value; may be null or empty
     * @return a new mutable list of the tokens, empty when str is null or empty
     */
    private List parsePropCommaSeparated(String str)
    {
        List tokens = new ArrayList();
        if(str == null || str.length() == 0)
        {
            return tokens;
        }

        for(StringTokenizer pieces = new StringTokenizer(str, ","); pieces.hasMoreTokens(); )
        {
            tokens.add(pieces.nextToken());
        }
        return tokens;
    }

    /**
     * Loads the list of websites to download from a text file.
     *
     * Each non-blank line holds a site URL, optionally followed by whitespace
     * and a filter pattern; further columns are ignored. Leading and trailing
     * whitespace is ignored and blank lines are skipped.
     *
     * The website list is replaced as soon as the file has been opened, so
     * after a failure part-way through it holds the sites read so far.
     *
     * @param websiteFile path of the website list file
     * @return true if the whole file was read, false if it could not be
     *         opened or an error occurred while reading it (the error is logged)
     */
    public boolean parseWebsites(String websiteFile)
    {
        _logClass.info("enter parseWebsites");

        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(websiteFile));
            _websiteHash = new HashSet();

            // Read the website list line by line.
            String line;
            while ((line = br.readLine()) != null) {
                String trimmed = line.trim();
                if (trimmed.length() == 0) {
                    // Blank line: nothing to configure.
                    continue;
                }

                // After trimming, the first token is guaranteed to be non-empty.
                String[] substrs = trimmed.split("\\s+");
                _logClass.info("web name: " + substrs[0]);

                // Set the site URL and, if present, the filter pattern.
                SiteConfig sc = new SiteConfig();
                sc.setSiteUrl(substrs[0]);
                if (substrs.length >= 2) {
                    sc.setFilterPattern(substrs[1]);
                }

                _websiteHash.add(sc);
            }
            return true;
        }
        catch(Exception e) {
            _logClass.error("Failed to load website list from '" + websiteFile + "': " + e.getMessage(), e);
            return false;
        }
        finally {
            // Always release the file handle, also on early exit or error.
            if (br != null) {
                try {
                    br.close();
                }
                catch(IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
    }

    /**
     * Returns the websites loaded by parseWebsites.
     *
     * @return a new array with the configured sites; empty when no website
     *         list has been loaded yet
     */
    public SiteConfig[] getWebsites()
    {
        if (_websiteHash == null)
        {
            // parseWebsites has not been called, or could not open its file.
            return new SiteConfig[0];
        }
        return (SiteConfig[]) _websiteHash.toArray(new SiteConfig[0]);
    }

	/**
	 * Returns the next simulated mobile number.
	 *
	 * Despite the name, the numbers are not chosen randomly: each call
	 * advances an index by one and wraps around at the end of the set, so the
	 * configured numbers are handed out in rotation.
	 *
	 * @return the next mobile number, or null when no numbers are configured
	 */
	public synchronized String getRandomMobileNo() {
		// Guard against a missing or empty set; the modulo below would
		// otherwise fail with a NullPointerException or a division by zero.
		if (mobileNos == null || mobileNos.isEmpty()) {
			return null;
		}

		String[] mobileNoArray = (String[]) mobileNos.toArray(new String[0]);
		_mobile_no_index = (_mobile_no_index + 1) % mobileNoArray.length;

		return mobileNoArray[_mobile_no_index];
	}
	
    /**
     * Sets the refreshHTMLs flag.
     * NOTE(review): presumably controls whether HTML pages are re-downloaded - confirm against the spider.
     *
     * @param refreshHTMLs the new value of the flag
     */
    public void setRefreshHTMLs(final boolean refreshHTMLs) {
        this.refreshHTMLs = refreshHTMLs;
    }

    /**
     * Returns the current value of the refreshHTMLs flag.
     *
     * @return the refreshHTMLs flag
     */
    public boolean refreshHTMLs() {
        return this.refreshHTMLs;
    }

    /**
     * Sets the refreshImages flag.
     * NOTE(review): presumably controls whether images are re-downloaded - confirm against the spider.
     *
     * @param refreshImages the new value of the flag
     */
    public void setRefreshImages(final boolean refreshImages) {
        this.refreshImages = refreshImages;
    }

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -