📄 spider.java
字号:
/*
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
*
* Copyright (c) 2001 Brian Pitcher
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
// $Header: /CVSRepository/spider/cn/yicha/subject/spider/Spider.java,v 1.1 2005/12/01 02:10:19 zhangdi Exp $
package cn.yicha.subject.spider;
import java.util.*;
import java.io.*;
import java.lang.Runtime;
import java.lang.System;
import java.sql.SQLException;
import java.net.URL;
import org.apache.log4j.Category;
import cn.yicha.common.util.Log4j;
import cn.yicha.common.util.Logger;
import cn.yicha.subject.spider.extractor.HTMLParser;
import cn.yicha.subject.spider.fecther.URLGetter;
import cn.yicha.subject.spider.store.*;
import cn.yicha.subject.spider.ui.FileSpider;
import cn.yicha.subject.spider.writer.NonPageLinkLog;
public class Spider extends Logger implements Runnable, Constants
{
/** Configuration for the spider; assigned once in the constructor. */
private SpiderConfig config;
/**
* Download queue. Created and seeded with the configured websites in the
* constructor; replaced wholesale by readCheckpoint().
* Thread safety: To access the queue, first synchronize on it.
*/
private DownloadQueue queue;
/**
* Set of URLs downloaded or scheduled, so we don't download a
* URL more than once.
* Thread safety: To access the set, first synchronize on it.
*/
private Set urlsDownloadedOrScheduled;
/**
* Set of URLs currently being downloaded by Spider threads.
* Written to and restored from the checkpoint file together with the queue.
* Thread safety: To access the set, first synchronize on it.
*/
private Set urlsDownloading;
/**
* Number of downloads currently taking place.
* Thread safety: To modify this value, first synchronize on
* the download queue.
*/
private int downloadsInProgress;
/** Whether the spider should quit; cleared by start() and set by stop(). */
private boolean quit;
/**
* Count of running Spider threads. Reset to zero and then incremented once
* per started thread in start().
* NOTE(review): this used to be an instance field and is now static, so the
* counter is shared by every Spider instance in the JVM - confirm that is intended.
*/
private static int running;
/**
* Time we last checkpointed, as a System.currentTimeMillis() value.
* Starts at 0 so the first interval check always finds a checkpoint due.
*/
private long lastCheckpoint;
private Set urlsError; // Set of pages whose download failed
private Object exitLock; // Lock object used when a thread is about to exit; run() synchronizes on it while bumping its empty-queue retry counter
public Spider(SpiderConfig config)
{
this.config = config;
queue = new DownloadQueue(config);
// 向队列添加待下载网站列表
queue.queueURLs(config.getWebsites());
urlsDownloadedOrScheduled = new HashSet();
urlsDownloading = new HashSet();
downloadsInProgress = 0;
lastCheckpoint = 0;
urlsError = new HashSet();
exitLock = new Object();
}
/**
* Prints the JVM's current free-memory figure to standard output.
*
* @param para label placed in front of the figure
*/
private void printMemory(String para)
{
    long freeBytes = Runtime.getRuntime().freeMemory();
    String line = para + "--> 内存数: " + freeBytes;
    System.out.println(line);
}
/**
* Clears the quit flag, resets the running-thread counter and launches one
* thread per configured spider thread, each executing this object's run().
* Threads are named "Spider-Thread-1", "Spider-Thread-2", and so on.
*/
public void start()
{
    quit = false;
    running = 0;

    int launched = 0;
    // The configured thread count is re-read on every pass, as before.
    while(launched < config.getSpiderThreads())
    {
        _logClass.info("Starting Spider thread");
        String threadName = "Spider-Thread-" + (launched + 1);
        Thread worker = new Thread(this, threadName);
        worker.start();
        running++;
        launched++;
    }
}
/**
* Raises the quit flag. This method only sets the flag; it neither
* interrupts nor waits for any thread.
*/
public void stop()
{
    this.quit = true;
}
/**
* Reports whether any Spider thread is currently counted as running.
* The previous implementation returned {@code running == 0}, which is the
* opposite of what the method name promises.
*
* @return true when the running-thread counter is greater than zero
*/
public boolean isRunning()
{
    return running > 0;
}
/**
* Returns the current value of the running-thread counter.
*
* @return number of Spider threads currently counted as running
*/
public int getRunning()
{
    return Spider.running;
}
/**
* Writes a checkpoint when more than the configured checkpoint interval has
* elapsed since the last one. An interval of zero disables checkpointing.
* The elapsed-time test is made once without a lock and repeated while
* holding the queue lock before the checkpoint is actually written.
*/
private void checkpointIfNeeded()
{
    // Interval 0 means checkpointing is switched off.
    if(config.getCheckpointInterval() == 0)
    {
        return;
    }

    // Cheap first look without taking the queue lock.
    boolean due = System.currentTimeMillis() - lastCheckpoint > config.getCheckpointInterval();
    if(!due)
    {
        return;
    }

    synchronized(queue)
    {
        // Test again under the lock: another thread may have checkpointed meanwhile.
        boolean stillDue = System.currentTimeMillis() - lastCheckpoint > config.getCheckpointInterval();
        if(stillDue)
        {
            writeCheckpoint();
            lastCheckpoint = System.currentTimeMillis();
        }
    }
}
/**
* Serializes the download queue followed by the set of in-progress URLs to
* the file "spider.checkpoint" (relative path), overwriting any previous
* checkpoint.
* Best effort: I/O failures are logged as warnings and never propagated, and
* the file is closed on every path.
* checkpointIfNeeded() calls this while holding the queue lock.
*/
private void writeCheckpoint()
{
    FileOutputStream fos = null;
    ObjectOutputStream oos = null;
    try
    {
        fos = new FileOutputStream("spider.checkpoint", false);
        oos = new ObjectOutputStream(fos);
        oos.writeObject(queue);
        oos.writeObject(urlsDownloading);
        oos.flush();
    }
    catch(IOException ioe)
    {
        _logClass.warn("IO Exception attempting checkpoint: " + ioe);
    }
    finally
    {
        // Close the outermost stream that was successfully created; closing the
        // object stream also closes the file stream underneath it.
        try
        {
            if(oos != null)
            {
                oos.close();
            }
            else if(fos != null)
            {
                fos.close();
            }
        }
        catch(IOException closeFailure)
        {
            _logClass.warn("IO Exception closing checkpoint file: " + closeFailure);
        }
    }
}
/**
* Restores the spider's state from the file "spider.checkpoint" (relative path).
* The file is expected to hold a serialized download queue followed by the
* set of URLs that were being downloaded when the checkpoint was written.
* Those URLs are put back on the restored queue and the set is emptied.
* Both fields are replaced only after the whole checkpoint has been read, so
* a missing or damaged file leaves the current state untouched.
* Best effort: any failure is logged as a warning and never propagated, and
* the file is closed on every path.
*/
public void readCheckpoint()
{
    FileInputStream fis = null;
    ObjectInputStream ois = null;
    try
    {
        fis = new FileInputStream("spider.checkpoint");
        ois = new ObjectInputStream(fis);
        // NOTE(review): native Java deserialization trusts the file content;
        // the checkpoint file must not be writable by untrusted parties.
        DownloadQueue restoredQueue = (DownloadQueue) ois.readObject();
        Set restoredDownloading = (Set) ois.readObject();

        // Interrupted downloads have to be fetched again.
        restoredQueue.queueURLs(restoredDownloading);
        restoredDownloading.clear();

        // Install the restored state only once everything above succeeded.
        queue = restoredQueue;
        urlsDownloading = restoredDownloading;
    }
    catch(Exception e)
    {
        _logClass.warn("Caught exception reading checkpoint: " + e);
    }
    finally
    {
        // Close the outermost stream that was successfully created; closing the
        // object stream also closes the file stream underneath it.
        try
        {
            if(ois != null)
            {
                ois.close();
            }
            else if(fis != null)
            {
                fis.close();
            }
        }
        catch(IOException closeFailure)
        {
            _logClass.warn("IO Exception closing checkpoint file: " + closeFailure);
        }
    }
}
/**
* 每个线程的执行函数
*/
public void run()
{
//final int _MAX_TRY_COUNT = 8;
final int _MAX_TRY_COUNT = 3;
int queueEmptyTryCount = 0;
Date spiderStartTime = new Date(); // 下载线程开始时间
int spiderUrlCount = 0; // 下载页面个数
HTMLParser htmlParser = new HTMLParser(config);
URLGetter urlGetter = new URLGetter(config);
// while((queueSize() > 0 || downloadsInProgress > 0) && quit == false && queueEmptyTryCount <= _MAX_TRY_COUNT)
while(queueEmptyTryCount <= _MAX_TRY_COUNT)
{
// 保存当前下载堆栈状态
checkpointIfNeeded();
// 对下载时间和下载页面数作限制
Date spiderCurrTime = new Date();
if (spiderCurrTime.getTime() - spiderStartTime.getTime() > config.getMaxDownloadTime() * 1000) {
_logClass.warn("time expired ...");
break;
}
if (++spiderUrlCount > config.getMaxDownloadPages()) {
_logClass.warn("fetched urls exceeded ...");
break;
}
// 判断下载任务是否全部完毕
if(queueSize() == 0 && downloadsInProgress > 0)
{
// Wait for a download to finish before seeing if this thread should stop
try {
Thread.sleep(QUEUE_CHECK_INTERVAL);
}
catch(InterruptedException ignored) {
}
// Have another go at the loop
continue;
}
else if(queueSize() == 0)
{
// 防止线程在开始下载时即退出
synchronized(exitLock) {
queueEmptyTryCount++;
}
if (queueEmptyTryCount > _MAX_TRY_COUNT) {
break;
}
else {
try {
_logClass.info("sleep here ...");
Thread.sleep(QUEUE_CHECK_INTERVAL);
}
catch(InterruptedException ignored) {
}
continue;
}
}
// 取得待下载网页URL
URLToDownload nextURL;
synchronized(queue)
{
nextURL = queue.getNextInQueue();
if (nextURL == null) {
if (queueEmptyTryCount > _MAX_TRY_COUNT) {
break;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -