⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 spider.java

📁 是个java写的spider,非常不错!能承受很大的压力,每天采集的数量在10000万
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
/*
 * This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
 *
 * Copyright (c) 2001 Brian Pitcher
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

// $Header: /CVSRepository/spider/cn/yicha/subject/spider/Spider.java,v 1.1 2005/12/01 02:10:19 zhangdi Exp $

package cn.yicha.subject.spider;

import java.util.*;
import java.io.*;
import java.lang.Runtime;
import java.lang.System;
import java.sql.SQLException;
import java.net.URL;

import org.apache.log4j.Category;

import cn.yicha.common.util.Log4j;
import cn.yicha.common.util.Logger;
import cn.yicha.subject.spider.extractor.HTMLParser;
import cn.yicha.subject.spider.fecther.URLGetter;
import cn.yicha.subject.spider.store.*;
import cn.yicha.subject.spider.ui.FileSpider;
import cn.yicha.subject.spider.writer.NonPageLinkLog;



public class Spider extends Logger implements Runnable, Constants
{
    /** Config for the spider */
    private SpiderConfig config;
    /**
     * Download queue.
     * Thread safety: To access the queue, first synchronize on it.
     */
    private DownloadQueue queue;
    /**
     * Set of URLs downloaded or scheduled, so we don't download a
     * URL more than once.
     * Thread safety: To access the set, first synchronize on it.
     */
    private Set urlsDownloadedOrScheduled;
    /**
     * Set of URLs currently being downloaded by Spider threads.
     * Thread safety: To access the set, first synchronize on it.
     */
    private Set urlsDownloading;
    /**
     * Number of downloads currently taking place.
     * Thread safety: To modify this value, first synchronize on
     *                the download queue.
     */
    private int downloadsInProgress;
    /** Whether the spider should quit */
    private boolean quit;
    /**
     * Count of running Spider threads.
     * NOTE(review): the field was changed from an instance field to static,
     * so the count is shared by ALL Spider instances in the JVM — confirm
     * that only one Spider is ever created at a time.
     */
    //*private int running;
	private static int running;
    /** Time we last checkpointed. */
    private long lastCheckpoint;

	private Set urlsError;				// URLs whose download failed
    private Object exitLock;			// lock object guarding thread-exit bookkeeping

    /**
     * Creates a spider driven by the given configuration and seeds the
     * download queue with the configured start web sites.
     *
     * @param config spider configuration (thread count, limits, seed URLs)
     */
    public Spider(SpiderConfig config)
    {
        this.config = config;

        // Build the download queue and seed it with the start URLs.
        queue = new DownloadQueue(config);
        queue.queueURLs(config.getWebsites());

        // Bookkeeping sets for de-duplication, in-flight and failed URLs.
        urlsDownloadedOrScheduled = new HashSet();
        urlsDownloading = new HashSet();
        urlsError = new HashSet();

        downloadsInProgress = 0;
        lastCheckpoint = 0;

        exitLock = new Object();
    }

    /**
     * Prints the JVM's current free-memory figure to stdout, prefixed with
     * the caller-supplied label. Debug helper only.
     *
     * @param para label printed in front of the memory figure
     */
    private void printMemory(String para)
    {
        long freeBytes = Runtime.getRuntime().freeMemory();
        System.out.println(para + "--> 内存数: " + freeBytes);
    }
	
    /**
     * Spawns the configured number of spider worker threads.
     * Clears the quit flag and resets the running-thread counter first.
     */
    public void start()
    {
        quit = false;
        running = 0;

        for(int i = 0; i < config.getSpiderThreads(); i++)
        {
            _logClass.info("Starting Spider thread");
            Thread t = new Thread(this, "Spider-Thread-" + (i + 1));
            // Bug fix: count the thread BEFORE starting it. The original
            // incremented after t.start(), so a worker that finished (and
            // decremented the counter) before the increment ran would leave
            // the count permanently wrong.
            running++;
            t.start();
        }
    }

    /**
     * Asks all spider threads to stop; each thread exits at the top of its
     * next work-loop iteration.
     * NOTE(review): {@code quit} is not volatile, so worker threads are not
     * guaranteed to see this write promptly — confirm this is acceptable.
     */
    public void stop()
    {
        quit = true;
    }

    /**
     * Reports whether any spider worker threads are still running.
     *
     * @return true while at least one worker thread is active
     */
    public boolean isRunning()
    {
        // Bug fix: the original returned (running == 0) — the exact inverse
        // of what the method name promises. "Running" means at least one
        // worker thread is still alive.
        return running != 0;
    }

	/**
	 * Returns the current count of running spider worker threads.
	 *
	 * @return number of active Spider threads
	 */
	public int getRunning()
	{
		return running;
	}

    /**
     * Writes a checkpoint when the configured interval has elapsed since the
     * last one. A checkpoint interval of zero disables checkpointing.
     * Double-checked: the elapsed-time test is repeated under the queue lock
     * so that only one thread actually performs the write.
     */
    private void checkpointIfNeeded()
    {
        // Interval of zero means checkpointing is turned off.
        if(config.getCheckpointInterval() == 0)
        {
            return;
        }

        // Cheap unsynchronized pre-check; not due yet → nothing to do.
        if(System.currentTimeMillis() - lastCheckpoint <= config.getCheckpointInterval())
        {
            return;
        }

        synchronized(queue)
        {
            // Re-check under the lock: another thread may have already
            // checkpointed while we were waiting to enter.
            if(System.currentTimeMillis() - lastCheckpoint > config.getCheckpointInterval())
            {
                writeCheckpoint();
                lastCheckpoint = System.currentTimeMillis();
            }
        }
    }

    /**
     * Serializes the download queue and the set of in-flight URLs to
     * "spider.checkpoint" so a later run can resume via readCheckpoint().
     * Checkpointing is best-effort: IO failures are logged, never thrown.
     */
    private void writeCheckpoint()
    {
        ObjectOutputStream oos = null;
        try
        {
            oos = new ObjectOutputStream(new FileOutputStream("spider.checkpoint", false));
            oos.writeObject(queue);
            oos.writeObject(urlsDownloading);
        }
        catch(IOException ioe)
        {
            // Bug fix: the original silently swallowed the failure (log line
            // was commented out) — log it so operators notice lost checkpoints.
            _logClass.warn("IO Exception attempting checkpoint: " + ioe.getMessage(), ioe);
        }
        finally
        {
            // Bug fix: close in finally — the original leaked the stream
            // whenever writeObject() threw.
            if(oos != null)
            {
                try
                {
                    oos.close();
                }
                catch(IOException ignored)
                {
                    // best-effort close; nothing useful to do here
                }
            }
        }
    }

    /**
     * Restores spider state from "spider.checkpoint": reloads the download
     * queue, re-queues any URLs that were mid-download when the checkpoint
     * was written, and clears the in-flight set. Failures are logged and
     * the spider starts fresh.
     */
    public void readCheckpoint()
    {
        ObjectInputStream ois = null;
        try
        {
            ois = new ObjectInputStream(new FileInputStream("spider.checkpoint"));
            queue = (DownloadQueue) ois.readObject();
            urlsDownloading = (Set) ois.readObject();
            // URLs interrupted mid-download must be fetched again.
            queue.queueURLs(urlsDownloading);
            urlsDownloading.clear();
        }
        catch(Exception e)
        {
            // Bug fix: the original silently swallowed the failure (log line
            // was commented out) — record why the checkpoint was unusable.
            _logClass.error("Caught exception reading checkpoint: " + e.getMessage(), e);
        }
        finally
        {
            // Bug fix: the original never closed the stream at all.
            if(ois != null)
            {
                try
                {
                    ois.close();
                }
                catch(IOException ignored)
                {
                    // best-effort close; nothing useful to do here
                }
            }
        }
    }

	/**
	* 每个线程的执行函数
	*/
    public void run()
    {
    	//final int _MAX_TRY_COUNT = 8;
		final int _MAX_TRY_COUNT = 3;

		int queueEmptyTryCount = 0;

		Date spiderStartTime = new Date();		// 下载线程开始时间
		int spiderUrlCount = 0;					// 下载页面个数
		
		HTMLParser htmlParser =	new HTMLParser(config);
		URLGetter urlGetter = new URLGetter(config);

		// while((queueSize() > 0 || downloadsInProgress >	0) && quit == false && queueEmptyTryCount <= _MAX_TRY_COUNT)
		while(queueEmptyTryCount <= _MAX_TRY_COUNT)			
		{
			// 保存当前下载堆栈状态
		    checkpointIfNeeded();

			// 对下载时间和下载页面数作限制
			Date spiderCurrTime = new Date();
			if (spiderCurrTime.getTime() - spiderStartTime.getTime() > config.getMaxDownloadTime() * 1000) {
				_logClass.warn("time expired ...");
				break;
			}

			if (++spiderUrlCount > config.getMaxDownloadPages()) {
				_logClass.warn("fetched urls exceeded ...");
				break;
			}
			
			// 判断下载任务是否全部完毕
		    if(queueSize() == 0	&& downloadsInProgress > 0)
		    {
				// Wait for a download to finish before seeing if this thread should stop
				try {
				    Thread.sleep(QUEUE_CHECK_INTERVAL);
				}
				catch(InterruptedException ignored) {
				}
				// Have another go at the loop
				continue;
		    }
		    else if(queueSize()	== 0)
		    {
		    	// 防止线程在开始下载时即退出
		    	synchronized(exitLock) {
		    		queueEmptyTryCount++;
		    }
				
			if (queueEmptyTryCount > _MAX_TRY_COUNT) {
				break;
			}
			else {
				try {
					_logClass.info("sleep here ...");
				    Thread.sleep(QUEUE_CHECK_INTERVAL);
				}
				catch(InterruptedException ignored) {
				}
				continue;
			}
		}

			// 取得待下载网页URL
		    URLToDownload nextURL;
		    synchronized(queue)
		    {
				nextURL	= queue.getNextInQueue();
				if (nextURL == null) {
					if (queueEmptyTryCount > _MAX_TRY_COUNT) {
						break;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -