⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 filespider.java

📁 是个Java写的spider,非常不错!能承受很大的压力,每天采集的数量在10000万
💻 JAVA
字号:
package cn.yicha.subject.spider.ui;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Properties;

import org.apache.log4j.Category;

import cn.yicha.common.util.Log4j;
import cn.yicha.subject.spider.SiteConfig;
import cn.yicha.subject.spider.Spider;
import cn.yicha.subject.spider.SpiderConfig;
import cn.yicha.subject.spider.store.*;
import cn.yicha.subject.spider.wapsite.GameSite;
import cn.yicha.subject.spider.wapsite.GameSiteFetcher;
import cn.yicha.subject.spider.writer.NonPageLinkLog;


/**
 * Command-line entry point that configures a {@link Spider} from
 * {@code config/spider.properties} and a game-site list, starts it, and
 * stops it when input becomes available on standard input.
 *
 * <p>Usage:
 * <pre>
 *   FileSpider kind site_config site_list
 *   FileSpider kind site_config site_list previous_log
 * </pre>
 * The four-argument form enables "resume" mode: the URIs recorded in
 * {@code previous_log} are handed to the {@link SpiderConfig} and are also
 * kept in a static table reachable through {@link #get_prevLinkht()}.
 *
 * <p>All state is held in static fields, so only one spider run per JVM is
 * supported by this class.
 */
public class FileSpider {
	static {
		// Static initializers run in textual order, so this executes before
		// the logger field below is created.
		Log4j.init();
	}
	private static final Category _logClass = Category.getInstance(FileSpider.class);
	
	/** Path of the spider properties file, relative to the working directory. */
	protected static final String _SPIDER_PROP_FILE = "config/spider.properties";
	
	/**
	 * Delay between two polls of standard input while the spider is running.
	 * Keeps the waiting thread from spinning at full CPU.
	 */
	private static final long _STOP_POLL_INTERVAL_MS = 200L;
	
	// Run parameters. The defaults below are always overwritten by main(),
	// which accepts only the 3- and 4-argument forms.
	private static String _kind = "game";
	private static String _site_config = "config/wap_game_sites.conf";
	private static String _siteTags = "1,2,3";
	private static boolean _resume = true;
	private static String _logToResume = "log/nonpagelinks_2005-11-12-16-10-09_site-1,2,3.log"; 
	
	/**
	 * Maps a previously seen URI to its HeaderContent record. Only populated
	 * in resume mode; otherwise stays null unless set via set_prevLinkht().
	 */
	private static Hashtable _prevLinkht;
	
	/**
	 * Builds the spider configuration, optionally loads the previous run's
	 * links, starts the spider and blocks until it finishes or the user
	 * requests a stop through standard input.
	 *
	 * <p>Terminates the JVM with status 1 if the site list is rejected by the
	 * configuration or if polling standard input fails.
	 *
	 * @param props spider properties loaded from {@link #_SPIDER_PROP_FILE}
	 */
	private static void StartSpider(Properties props) {
		
		_logClass.debug("Configuring Spider from properties");
		
		// Initialise the list of sites to crawl.
		GameSiteFetcher gsf = new GameSiteFetcher(_site_config, _siteTags);
		GameSite[] gs = gsf.get();
		
		// Read the spider properties.
		SpiderConfig config = new SpiderConfig(props);
		if (!config.setWebsites(gs)) {
			_logClass.error("读取网站配置文件错误");
			System.exit(1);
		}

		// When resuming, read the earlier log and store it in the config.
		if (_resume) {
			loadPreviousLinks(config);
		}
		
		// Start crawling.
		_logClass.debug(config);
		NonPageLinkLog.init(_kind, _siteTags);
		Spider spider = new Spider(config);

		_logClass.info("Starting Spider...");
		spider.start();

		// Stop crawling once input is available on standard input.
		System.out.println("\nHit any key to stop Spider\n");
		try {
			waitForStopRequest(spider);
		} catch (IOException ioe) {
			_logClass.error("Unexpected exception caught: " + ioe.getMessage(),
					ioe);
			System.exit(1);
		}
	}

	/**
	 * Parses the log named by {@code _logToResume} and publishes its entries
	 * in two forms: the set of URIs is passed to the config, and a
	 * URI-to-record table is stored in {@code _prevLinkht}.
	 *
	 * @param config configuration that receives the set of previous URIs
	 */
	private static void loadPreviousLinks(SpiderConfig config) {
		HeaderContent[] prevHc = NonPageLinkLog.praseHeaderLog(_logToResume);
		HashSet hsPreLinks = new HashSet();
		Hashtable prevLinks = new Hashtable();
		// A single pass fills both collections from the same records.
		for (int i = 0; i < prevHc.length; i++) {
			hsPreLinks.add(prevHc[i].get_uri());
			prevLinks.put(prevHc[i].get_uri(), prevHc[i]);
		}
		config.setHsPreLinks(hsPreLinks);
		_prevLinkht = prevLinks;
	}

	/**
	 * Blocks while the spider is running, stopping it as soon as any input is
	 * available on standard input.
	 *
	 * <p>The loop sleeps between polls instead of spinning. An interrupt
	 * received while sleeping does not end the wait; the interrupt status is
	 * restored on the calling thread before this method returns.
	 *
	 * @param spider the running spider to watch and possibly stop
	 * @throws IOException if querying standard input fails
	 */
	private static void waitForStopRequest(Spider spider) throws IOException {
		boolean interrupted = false;
		try {
			while (spider.isRunning()) {
				if (System.in.available() != 0) {
					System.out.println("\nStopping Spider...\n");
					spider.stop();
					break;
				}
				try {
					Thread.sleep(_STOP_POLL_INTERVAL_MS);
				} catch (InterruptedException ie) {
					// Keep waiting, but remember to restore the flag.
					interrupted = true;
				}
			}
		} finally {
			if (interrupted) {
				Thread.currentThread().interrupt();
			}
		}
	}

	/**
	 * Loads {@link #_SPIDER_PROP_FILE} into a new {@link Properties} object.
	 * Logs the failure and terminates the JVM with status 1 if the file is
	 * missing or cannot be read.
	 *
	 * @return the loaded properties
	 */
	private static Properties loadProperties() {
		Properties props = new Properties();
		try {
			FileInputStream propsIn = new FileInputStream(_SPIDER_PROP_FILE);
			try {
				props.load(propsIn);
			} finally {
				// Close the stream even when load() fails.
				propsIn.close();
			}
		} catch (FileNotFoundException fnfe) {
			// Report the file that was actually opened, not a CLI argument.
			_logClass.error("File not found: " + _SPIDER_PROP_FILE, fnfe);
			System.exit(1);
		} catch (IOException ioe) {
			_logClass.error("IO Exception caught reading config file: "
					+ ioe.getMessage(), ioe);
			System.exit(1);
		}
		return props;
	}

	/** Prints the two accepted command-line forms to standard output. */
	protected static void usage() {
		System.out.println("usage:");
		System.out.println("FileSpider kind site_config site_list");
		System.out.println("FileSpider kind site_config site_list previous_log");
	}

	/**
	 * Program entry point.
	 *
	 * @param args {@code kind site_config site_list [previous_log]}; with
	 *             three arguments resume mode is off, with four it is on.
	 *             Any other count prints the usage text and exits with
	 *             status 0.
	 */
	public static void main(String[] args) {
		if (args.length == 3) {
			_resume = false;
			_kind = args[0];
			_site_config = args[1];
			_siteTags = args[2];
			_logToResume = "";
		} else if (args.length == 4) {
			_resume = true;
			_kind = args[0];
			_site_config = args[1];
			_siteTags = args[2];
			_logToResume = args[3];
		} else {
			usage();
			System.exit(0);
		}

		// Set JVM-wide connect/read timeouts (milliseconds) via system properties.
		System.setProperty("sun.net.client.defaultConnectTimeout", "60000");
		System.setProperty("sun.net.client.defaultReadTimeout", "60000");

		_logClass.debug("main()");

		// Read the properties file, then hand over to the spider.
		Properties props = loadProperties();
		
		StartSpider(props);

	}

	/** @return whether the current run resumes from a previous log */
	public static boolean is_resume() {
		return _resume;
	}

	/** @param _resume new value of the resume flag */
	public static void set_resume(boolean _resume) {
		FileSpider._resume = _resume;
	}

	/**
	 * @return the URI-to-record table built in resume mode, or null if it
	 *         has not been built or set
	 */
	public static Hashtable get_prevLinkht() {
		return _prevLinkht;
	}

	/** @param linkht table that replaces the current previous-link table */
	public static void set_prevLinkht(Hashtable linkht) {
		_prevLinkht = linkht;
	}

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -