📄 filespider.java
字号:
package cn.yicha.subject.spider.ui;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Properties;
import org.apache.log4j.Category;
import cn.yicha.common.util.Log4j;
import cn.yicha.subject.spider.SiteConfig;
import cn.yicha.subject.spider.Spider;
import cn.yicha.subject.spider.SpiderConfig;
import cn.yicha.subject.spider.store.*;
import cn.yicha.subject.spider.wapsite.GameSite;
import cn.yicha.subject.spider.wapsite.GameSiteFetcher;
import cn.yicha.subject.spider.writer.NonPageLinkLog;
/**
 * Command-line launcher for the site spider.
 *
 * <p>Parses the command line, loads {@link #_SPIDER_PROP_FILE}, builds a
 * {@link SpiderConfig} for the selected sites, optionally seeds it with the
 * links recorded in a previous non-page-link log, and then runs a
 * {@link Spider} until it finishes or input becomes available on stdin.
 *
 * <p>All state is held in static fields; the class is not designed to be
 * instantiated.
 */
public class FileSpider {
    static {
        // Runs before the logger field below is initialised (textual order).
        // NOTE(review): presumably configures log4j — confirm in Log4j.init().
        Log4j.init();
    }

    private static final Category _logClass = Category.getInstance(FileSpider.class);

    /** Path of the spider properties file loaded by {@link #main(String[])}. */
    protected static final String _SPIDER_PROP_FILE = "config/spider.properties";

    /** Delay between two polls of the spider state / stdin while waiting, in milliseconds. */
    private static final long _STOP_POLL_INTERVAL_MS = 200L;

    // Parameters. The initial values below are overwritten by main() whenever
    // it is given a valid argument list.
    private static String _kind = "game";
    private static String _site_config = "config/wap_game_sites.conf";
    private static String _siteTags = "1,2,3";
    private static boolean _resume = true;
    private static String _logToResume = "log/nonpagelinks_2005-11-12-16-10-09_site-1,2,3.log";

    /** Previous-run records keyed by URI; only populated when resuming. */
    private static Hashtable _prevLinkht;

    /**
     * Builds the spider configuration, starts the spider and blocks until it
     * stops by itself or input becomes available on stdin.
     *
     * <p>Terminates the JVM with exit code 1 if the web sites cannot be set on
     * the configuration or if polling stdin fails.
     *
     * @param props properties used to construct the {@link SpiderConfig}
     */
    private static void StartSpider(Properties props) {
        _logClass.debug("Configuring Spider from properties");

        // Initialise the site list.
        GameSiteFetcher gsf = new GameSiteFetcher(_site_config, _siteTags);
        GameSite[] gs = gsf.get();

        // Read the properties.
        SpiderConfig config = new SpiderConfig(props);
        if (!config.setWebsites(gs)) {
            _logClass.error("读取网站配置文件错误");
            System.exit(1);
        }

        // When resuming, read the records of the previous run and hand their
        // URIs to the spider configuration.
        if (_resume) {
            HeaderContent[] prevHc = NonPageLinkLog.praseHeaderLog(_logToResume);
            HashSet hsPreLinks = new HashSet();
            Hashtable prevByUri = new Hashtable();
            for (int i = 0; i < prevHc.length; i++) {
                hsPreLinks.add(prevHc[i].get_uri());
                // Also index the full record by URI.
                prevByUri.put(prevHc[i].get_uri(), prevHc[i]);
            }
            config.setHsPreLinks(hsPreLinks);
            // Published after setHsPreLinks, as before.
            _prevLinkht = prevByUri;
        }

        // Start crawling.
        _logClass.debug(config);
        NonPageLinkLog.init(_kind, _siteTags);
        Spider spider = new Spider(config);
        _logClass.info("Starting Spider...");
        spider.start();

        // Input on stdin stops the crawl.
        System.out.println("\nHit any key to stop Spider\n");
        try {
            waitForStop(spider);
        } catch (IOException ioe) {
            _logClass.error("Unexpected exception caught: " + ioe.getMessage(),
                    ioe);
            System.exit(1);
        }
    }

    /**
     * Waits until the spider is no longer running, stopping it early as soon
     * as any input is available on stdin.
     *
     * <p>Sleeps between polls so that waiting does not spin a CPU core. If the
     * waiting thread is interrupted, the interrupt flag is restored and the
     * spider is stopped.
     *
     * @param spider the running spider to watch
     * @throws IOException if querying stdin fails
     */
    private static void waitForStop(Spider spider) throws IOException {
        while (spider.isRunning()) {
            if (System.in.available() != 0) {
                System.out.println("\nStopping Spider...\n");
                spider.stop();
                return;
            }
            try {
                Thread.sleep(_STOP_POLL_INTERVAL_MS);
            } catch (InterruptedException ie) {
                // Keep the interrupt visible to callers and shut down.
                Thread.currentThread().interrupt();
                spider.stop();
                return;
            }
        }
    }

    /** Prints the two accepted command-line forms to stdout. */
    protected static void usage() {
        System.out.println("usage:");
        System.out.println("FileSpider kind site_config site_list");
        System.out.println("FileSpider kind site_config site_list previous_log");
    }

    /**
     * Program entry point.
     *
     * <p>With three arguments ({@code kind site_config site_list}) a fresh run
     * is started; with four, the extra {@code previous_log} argument enables
     * resume mode. Any other argument count prints the usage text and exits
     * with code 0.
     *
     * @param args command-line arguments as described above
     */
    public static void main(String[] args) {
        if (args.length != 3 && args.length != 4) {
            usage();
            System.exit(0);
        }
        _kind = args[0];
        _site_config = args[1];
        _siteTags = args[2];
        // The fourth argument, when present, is the log to resume from.
        _resume = (args.length == 4);
        _logToResume = _resume ? args[3] : "";

        // Set system properties: 60000 ms connect and read timeouts for the
        // JDK's built-in URL protocol handlers.
        System.setProperty("sun.net.client.defaultConnectTimeout", "60000");
        System.setProperty("sun.net.client.defaultReadTimeout", "60000");
        _logClass.debug("main()");

        // Read the configuration file.
        Properties props = new Properties();
        try {
            FileInputStream propsIn = new FileInputStream(_SPIDER_PROP_FILE);
            try {
                props.load(propsIn);
            } finally {
                // Close even when load() fails so the handle is not leaked.
                propsIn.close();
            }
        } catch (FileNotFoundException fnfe) {
            // Report the file that was actually opened, not args[0].
            _logClass.error("File not found: " + _SPIDER_PROP_FILE, fnfe);
            System.exit(1);
        } catch (IOException ioe) {
            _logClass.error("IO Exception caught reading config file: "
                    + ioe.getMessage(), ioe);
            System.exit(1);
        }
        StartSpider(props);
    }

    /**
     * @return {@code true} if the run resumes from a previous log
     */
    public static boolean is_resume() {
        return _resume;
    }

    /**
     * @param _resume whether the run should resume from a previous log
     */
    public static void set_resume(boolean _resume) {
        FileSpider._resume = _resume;
    }

    /**
     * @return the URI-keyed table of previous-run records, or {@code null} if
     *         it has not been built or set
     */
    public static Hashtable get_prevLinkht() {
        return _prevLinkht;
    }

    /**
     * @param linkht replacement for the URI-keyed table of previous-run records
     */
    public static void set_prevLinkht(Hashtable linkht) {
        _prevLinkht = linkht;
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -