⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 downloadqueue.java

📁 是个java写的spider,非常不错!能承受很大的压力,每天采集的数量在10000万
💻 JAVA
字号:
/*
 * This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
 *
 * Copyright (c) 2001 Brian Pitcher
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

// $Header: /CVSRepository/spider/cn/yicha/subject/spider/DownloadQueue.java,v 1.1 2005/12/01 02:10:19 zhangdi Exp $

package cn.yicha.subject.spider;

import java.util.*;
import java.net.URL;
import java.io.Serializable;
import java.net.MalformedURLException;
import org.apache.log4j.Category;

import cn.yicha.common.util.Log4j;
import cn.yicha.subject.spdier.url.SpUrl;
import cn.yicha.subject.spider.fecther.URLGetter;
import cn.yicha.subject.spider.store.ParseUrl;




/**
 * Queue of URLs waiting to be downloaded, split into three priority buckets.
 *
 * <p>Each queued URL is classified through the {@link SpiderConfig} as
 * "interesting", "boring" or (when neither applies) "average".
 * {@link #getNextInQueue()} drains the interesting bucket first, then the
 * average one, then the boring one. A set of the external forms of all
 * currently queued URLs is kept so that the same URL is not queued twice
 * while it is still waiting.
 *
 * <p>No synchronization is performed inside this class.
 *
 * <p>NOTE(review): the class is {@code Serializable} without an explicit
 * {@code serialVersionUID}; field declarations and non-private signatures are
 * deliberately left unchanged so existing serialized instances are not
 * invalidated -- confirm before altering them.
 */
public class DownloadQueue implements Serializable
{
    // Logger is keyed on this class; it was previously created with
    // URLGetter.class, which attributed these messages to the wrong category.
    private final static Category _logClass = Category.getInstance(DownloadQueue.class);
    static {
        Log4j.init();
    }

    /** Spider settings used to classify URLs and choose the traversal order. */
    private SpiderConfig config;

    /** URLs for which {@code config.isInteresting(url)} returned true. */
    private List interestingURLsToDownload;
    /** URLs that are neither interesting nor boring. */
    private List averageURLsToDownload;
    /** URLs for which {@code config.isBoring(url)} returned true. */
    private List boringURLsToDownload;
    /** External forms (Strings) of every URL currently held in one of the lists. */
    private Set urlsInQueue;

    /**
     * Creates an empty queue.
     *
     * @param config spider configuration consulted on every queue operation
     */
    public DownloadQueue(SpiderConfig config)
    {
        this.config = config;
        interestingURLsToDownload = new ArrayList();
        averageURLsToDownload = new ArrayList();
        boringURLsToDownload = new ArrayList();
        urlsInQueue = new HashSet();
    }

    /**
     * Adds a single URL to the bucket matching its classification.
     * The call is a no-op when a URL with the same external form is already
     * waiting in the queue.
     *
     * @param url the download request to queue
     */
    public void queueURL(URLToDownload url)
    {
        URL u = url.getURL();
        String key = u.toExternalForm();

        if(urlsInQueue.contains(key))
        {
            return;
        }

        if(config.isInteresting(u))
        {
            enqueue(interestingURLsToDownload, url);
        }
        else if(config.isBoring(u))
        {
            enqueue(boringURLsToDownload, url);
        }
        else
        {
            enqueue(averageURLsToDownload, url);
        }

        urlsInQueue.add(key);
    }

    /**
     * Inserts {@code url} at the head of {@code bucket} when the spider is
     * configured for depth-first search, otherwise appends it at the tail.
     */
    private void enqueue(List bucket, URLToDownload url)
    {
        if(config.isDepthFirstSearch())
        {
            bucket.add(0, url);
        }
        else
        {
            bucket.add(url);
        }
    }

    /**
     * Queues every element of the given collection.
     *
     * @param urls collection whose elements are all {@link URLToDownload}
     */
    public void queueURLs(Collection urls)
    {
        for(Iterator i = urls.iterator(); i.hasNext(); )
        {
            URLToDownload u2d = (URLToDownload) i.next();
            queueURL(u2d);
        }
    }

    /**
     * Adds the site URL of every entry in the website list to the queue as a
     * top-level (depth 0) download.
     *
     * <p>Entries whose site URL cannot be parsed are logged and skipped; the
     * remaining entries are still processed.
     *
     * @param websiteConfigs the configured sites to seed the queue with
     */
    public void queueURLs(SiteConfig[] websiteConfigs)
    {
        final int _TOP_LEVEL = 0;

        for (int i = 0; i < websiteConfigs.length; i++)
        {
            SiteConfig sc = websiteConfigs[i];
            String websiteUrl = sc.getSiteUrl();

            try
            {
                _logClass.info("queue url "	+ websiteUrl + " ...");
                URL url = new URL(websiteUrl);

                if (config.downloadMonternet())
                {
                    // Look up the URL -> ServiceID mapping in the in-memory table.
                    String serviceID = SpUrl.getServiceIDFromCache(websiteUrl);
                    if (serviceID != null)
                    {
                        _logClass.info("serviceID --> " + serviceID);
                        queueURL(new URLToDownload(url, _TOP_LEVEL, serviceID, sc.getFilterPattern()));
                    }
                    else
                    {
                        // Without a serviceID the site is not queued; say so
                        // instead of dropping it without a trace.
                        _logClass.warn("no serviceID cached for " + websiteUrl + ", url not queued");
                    }
                }
                else
                {
                    // Put the URL straight into the queue; the service id is
                    // derived from the URL itself.
                    String serviceID;
                    if (config.filterBySecondDomain())
                    {
                        serviceID = ParseUrl.getDomainName(url);
                    }
                    else
                    {
                        serviceID = url.getHost();
                    }

                    queueURL(new URLToDownload(url, _TOP_LEVEL, serviceID, sc.getFilterPattern()));
                }
            }
            catch(MalformedURLException murle)
            {
                // Keep going with the other sites, but record the bad entry.
                _logClass.warn("skip malformed site url " + websiteUrl, murle);
            }
        }
    }

    /**
     * Removes and returns the next URL to download.
     *
     * @return the head of the first non-empty bucket, checked in the order
     *         interesting, average, boring; {@code null} when all are empty
     */
    public URLToDownload getNextInQueue()
    {
        if(interestingURLsToDownload.size() > 0)
        {
            return returnURLFrom(interestingURLsToDownload);
        }
        else if(averageURLsToDownload.size() > 0)
        {
            return returnURLFrom(averageURLsToDownload);
        }
        else if(boringURLsToDownload.size() > 0)
        {
            return returnURLFrom(boringURLsToDownload);
        }
        else
        {
            return null;
        }
    }

    /**
     * Pops the head of {@code urlList} and clears its entry from the set of
     * queued URLs.
     *
     * @param urlList a non-empty bucket
     * @return the element that was at index 0
     */
    private URLToDownload returnURLFrom(List urlList)
    {
        URLToDownload u2d = (URLToDownload) urlList.get(0);

        // Capture the de-duplication key BEFORE the URL may be rewritten
        // below, so the key removed is the same one queueURL() added.
        // NOTE(review): assumes applyProperUrl() can change what getURL()
        // returns -- confirm against URLToDownload.
        String queuedKey = u2d.getURL().toExternalForm();

        // When downloading Monternet, the subscription address has to be
        // converted into an address reachable from the internet.
        if (config.downloadMonternet())
        {
            u2d.applyProperUrl();
        }

        urlList.remove(0);
        urlsInQueue.remove(queuedKey);
        return u2d;
    }

    /**
     * @return the total number of URLs waiting across all three buckets
     */
    public int size()
    {
        return interestingURLsToDownload.size() + averageURLsToDownload.size() + boringURLsToDownload.size();
    }

    /**
     * @return the queue size followed by {@code " URLs"}
     */
    public String toString()
    {
        return size() + " URLs";
    }
} // End class DownloadQueue

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -