📄 spiderconfig.java
字号:
/*
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
*
* Copyright (c) 2001 Brian Pitcher
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
// $Header: /CVSRepository/spider/cn/yicha/subject/spider/SpiderConfig.java,v 1.1 2005/12/01 02:10:19 zhangdi Exp $
package cn.yicha.subject.spider;
import java.io.*;
import java.util.*;
import java.net.URL;
import java.net.MalformedURLException;
import cn.yicha.common.util.Logger;
import cn.yicha.subject.spider.wapsite.*;
public class SpiderConfig extends Logger implements Serializable
{
// Fallback values used when the corresponding property is missing or unparsable.
private final static int _DEFAULT_MAX_DOWNLOAD_TIME = 5 * 3600; // default maximum download time (seconds)
private final static int _DEFAULT_MAX_DOWNLOAD_PAGES = 20000; // default maximum number of pages to download
private final static int _DEFAULT_MAX_TRY_COUNT = 1; // default maximum number of download attempts
private File saveRootDirectory;
private File mailtoLogFile;
private boolean refreshHTMLs;
private boolean refreshImages;
private boolean refreshOthers;
private Set htmlExtensions;
private Set imageExtensions;
private Set ringExtensions;
private Set gameExtensions;
private Set invalidExtensions; // extensions of links treated as invalid, e.g. images and ringtones that are not downloaded
private URL startLocation;
private String urlMatch;
private List interestingURLSubstrings;
private List boringURLSubstrings;
private boolean depthFirst;
private int maxDepth;
private String userAgent;
private String basicAuthUser;
private String basicAuthPassword;
private String websiteFile; // website configuration file
private HashSet _websiteHash = null; // list of websites to download; null until parseWebsites() has opened a file
private File saveAnchorDirectory; // storage path for anchor attributes
private boolean downloadMonternet; // flag: download Monternet sites rather than free sites
private String saveRingFile; // file in which ringtone data is stored
private String saveGameFile; // file in which game data is stored
private boolean filterBySecondDomain; // whether sites are filtered by second-level domain
private boolean overrideLog; // log overwrite flag
private Set mobileNos; // simulated mobile phone numbers
private int maxTryCount; // maximum number of connection attempts
private int maxDownloadTime; // download time limit, in seconds
private int maxDownloadPages; // limit on the number of downloaded pages
private String proxyHost; // proxy server address
private String proxyPort; // proxy server port
private int spiderThreads;
private long checkpointInterval;
private int _mobile_no_index = 0; // index of the most recently returned mobile number
private HashSet _hsPreLinks = null;
/**
 * Creates a configuration populated entirely with built-in defaults.
 *
 * The values mirror the defaults that {@link #SpiderConfig(Properties)}
 * applies when a property is absent, so an instance built here behaves
 * like one built from an empty Properties object. The exceptions are
 * startLocation and urlMatch, which stay null, and the save directories,
 * which are not created on disk.
 */
public SpiderConfig()
{
    saveRootDirectory = new File(".");
    saveAnchorDirectory = new File(".");
    mailtoLogFile = new File("mailto.txt");

    refreshHTMLs = true;
    refreshImages = false;
    refreshOthers = false;
    filterBySecondDomain = true;
    downloadMonternet = false;
    overrideLog = true;

    htmlExtensions = new HashSet();
    htmlExtensions.add("htm");
    htmlExtensions.add("html");
    htmlExtensions.add("shtml");

    imageExtensions = new HashSet();
    imageExtensions.add("jpg");
    imageExtensions.add("gif");
    imageExtensions.add("png");

    invalidExtensions = new HashSet();
    invalidExtensions.add("jpg");
    invalidExtensions.add("gif");
    invalidExtensions.add("png");

    ringExtensions = new HashSet();
    ringExtensions.add("amr");
    ringExtensions.add("mid");

    gameExtensions = new HashSet();
    gameExtensions.add("jar");
    gameExtensions.add("jad");
    gameExtensions.add("sis");

    // Previously left null, which made getRandomMobileNo() throw a
    // NullPointerException on a default-constructed config.
    mobileNos = new HashSet();
    mobileNos.add("13968873803");
    mobileNos.add("13950428251");

    urlMatch = null;
    interestingURLSubstrings = new ArrayList();
    boringURLSubstrings = new ArrayList();
    depthFirst = false;
    maxDepth = 0;

    // Previously left at 0, ignoring the declared _DEFAULT_* limits.
    maxTryCount = _DEFAULT_MAX_TRY_COUNT;
    maxDownloadTime = _DEFAULT_MAX_DOWNLOAD_TIME;
    maxDownloadPages = _DEFAULT_MAX_DOWNLOAD_PAGES;

    // Alternative handset user agents that have been used with this spider:
    //   MOT-E680/R51_G_0F.42.A1P MIB/2.2 Profile/MIDP-2.0 Configuration/CLDC-1.1
    //   Nokia6108/1.0 (05.04) Profile/MIDP-1.0 Configuration/CLDC-1.0
    userAgent = "WebLech Spider 0.01alpha";
    basicAuthUser = "";
    basicAuthPassword = "";

    // Previously left null; the Properties constructor never yields null here.
    proxyHost = "";
    proxyPort = "";
    websiteFile = "config/website.conf";
    saveRingFile = "logs/ringlog.txt";
    saveGameFile = "logs/gamelog.txt";

    spiderThreads = 1;
    checkpointInterval = 0;
}
/**
 * Creates a configuration from a {@link java.util.Properties} object.
 *
 * Every setting has a built-in default that is used when the property is
 * absent. Numeric properties that cannot be parsed are logged as errors and
 * replaced by a fallback value. Numeric and boolean values are trimmed
 * before parsing because Properties.load() keeps trailing whitespace, so a
 * line such as "maxDepth=3 " would otherwise be rejected.
 *
 * Side effects: the directories named by saveRootDirectory and
 * saveAnchorDirectory are created if they do not exist; when that fails, or
 * the path is not a directory, "." is used instead.
 *
 * @param props source of the configuration values; must not be null
 */
public SpiderConfig(Properties props)
{
    _logClass.info("start to config spider");
    _logClass.debug("SpiderConfig(props)");

    // Directory for downloaded files and directory for anchor attributes.
    saveRootDirectory = resolveSaveDirectory(props.getProperty("saveRootDirectory", "."), "root");
    saveAnchorDirectory = resolveSaveDirectory(props.getProperty("saveAnchorDirectory", "."), "anchor");

    String mailtoFileStr = props.getProperty("mailtoLogFile", "mailto.txt");
    // A drive/scheme colon or a leading slash/backslash marks the name as absolute;
    // anything else is resolved against the save root.
    if(mailtoFileStr.indexOf(":") != -1 || mailtoFileStr.startsWith("/") || mailtoFileStr.startsWith("\\"))
    {
        _logClass.debug("Using absolute file name " + mailtoFileStr);
        mailtoLogFile = new File(mailtoFileStr);
    }
    else
    {
        _logClass.debug("Constructing relative file name " + saveRootDirectory.getPath() + "/" + mailtoFileStr);
        mailtoLogFile = new File(saveRootDirectory.getPath() + "/" + mailtoFileStr);
    }

    refreshHTMLs = readBooleanProperty(props, "refreshHTMLs", true);
    refreshImages = readBooleanProperty(props, "refreshImages", false);
    refreshOthers = readBooleanProperty(props, "refreshOthers", false);
    // Flag: download Monternet sites rather than free sites.
    downloadMonternet = readBooleanProperty(props, "downloadMonternet", false);
    // Flag: filter by second-level domain rather than third-level domain.
    filterBySecondDomain = readBooleanProperty(props, "filterBySecondDomain", true);
    // Log overwrite flag.
    overrideLog = readBooleanProperty(props, "overrideLog", true);

    htmlExtensions = parseSet(props.getProperty("htmlExtensions", "htm,html,shtml"));
    imageExtensions = parseSet(props.getProperty("imageExtensions", "jpg,gif,png"));
    invalidExtensions = parseSet(props.getProperty("invalidExtensions", "jpg,gif,png"));
    ringExtensions = parseSet(props.getProperty("ringExtensions", "mid,amr"));
    gameExtensions = parseSet(props.getProperty("gameExtensions", "jar,jad,sis"));
    // Mobile numbers presented while crawling.
    mobileNos = parseSet(props.getProperty("mobileNos", "13968873803,13950428251"));

    String startLocStr = props.getProperty("startLocation");
    if(startLocStr != null)
    {
        try {
            startLocation = new URL(startLocStr);
        }
        catch(MalformedURLException murle) {
            // startLocation stays null; the problem is only logged here.
            _logClass.error("Caught MalformedURLException parsing start URL '" + startLocStr + "' : " + murle.getMessage(), murle);
        }
    }
    else
    {
        _logClass.warn("startLocation not found in properties");
    }

    urlMatch = props.getProperty("urlMatch");
    interestingURLSubstrings = parsePropCommaSeparated(props.getProperty("interestingURLs"));
    boringURLSubstrings = parsePropCommaSeparated(props.getProperty("boringURLs"));
    depthFirst = readBooleanProperty(props, "depthFirst", false);

    // Note: an absent maxDepth means 0, but an unparsable one falls back to 1.
    maxDepth = readIntProperty(props, "maxDepth", 0, 1, "max depth");
    maxDownloadTime = readIntProperty(props, "maxDownloadTime",
        _DEFAULT_MAX_DOWNLOAD_TIME, _DEFAULT_MAX_DOWNLOAD_TIME, "max download time");
    maxDownloadPages = readIntProperty(props, "maxDownloadPages",
        _DEFAULT_MAX_DOWNLOAD_PAGES, _DEFAULT_MAX_DOWNLOAD_PAGES, "max download pages");
    maxTryCount = readIntProperty(props, "maxTryCount",
        _DEFAULT_MAX_TRY_COUNT, _DEFAULT_MAX_TRY_COUNT, "max download try count");

    // Proxy server address and port.
    proxyHost = props.getProperty("proxyHost", "");
    proxyPort = props.getProperty("proxyPort", "");
    userAgent = props.getProperty("userAgent", "WebLech Spider 0.01alpha");
    basicAuthUser = props.getProperty("basicAuthUser", "");
    basicAuthPassword = props.getProperty("basicAuthPassword", "");
    websiteFile = props.getProperty("websiteFile", "config/website.conf");
    saveRingFile = props.getProperty("saveRingFile", "logs/ringlog.txt");
    saveGameFile = props.getProperty("saveGameFile", "logs/gamelog.txt");

    spiderThreads = readIntProperty(props, "spiderThreads", 1, 1, "number of threads");

    try
    {
        String intervalStr = props.getProperty("checkpointInterval", "0");
        checkpointInterval = Long.parseLong(intervalStr.trim());
    }
    catch(NumberFormatException nfe)
    {
        _logClass.error("Caught number format exception parsing checkpoint interval, defaulting to 0", nfe);
        checkpointInterval = 0;
    }
}

/**
 * Returns the directory at the given path, creating it if necessary, or "."
 * when it cannot be created or the path exists but is not a directory.
 *
 * @param path  configured directory path
 * @param label short name used in log messages ("root" or "anchor")
 */
private File resolveSaveDirectory(String path, String label)
{
    File dir = new File(path);
    if(!dir.exists())
    {
        if(!dir.mkdirs())
        {
            _logClass.error("Couldn't create " + label + " directory: " + dir);
            _logClass.info("Defaulting to . instead");
            return new File(".");
        }
    }
    else if(!dir.isDirectory())
    {
        _logClass.error("Save " + label + " is not a directory: " + dir);
        _logClass.info("Defaulting to . instead");
        return new File(".");
    }
    return dir;
}

/**
 * Reads a boolean property, ignoring surrounding whitespace; any value other
 * than "true" (case-insensitive) yields false.
 */
private static boolean readBooleanProperty(Properties props, String key, boolean defaultValue)
{
    String raw = props.getProperty(key);
    if(raw == null)
    {
        return defaultValue;
    }
    return Boolean.valueOf(raw.trim()).booleanValue();
}

/**
 * Reads an int property, ignoring surrounding whitespace.
 *
 * @param defaultValue value used when the property is absent
 * @param fallback     value used (and logged) when the property is not a valid int
 * @param description  wording for the property in the error log message
 */
private int readIntProperty(Properties props, String key, int defaultValue, int fallback, String description)
{
    String raw = props.getProperty(key, String.valueOf(defaultValue));
    try {
        return Integer.parseInt(raw.trim());
    }
    catch(NumberFormatException nfe) {
        _logClass.error("Caught number format exception parsing " + description + ", defaulting to " + fallback, nfe);
        return fallback;
    }
}
/**
 * Splits a comma-separated property value into its individual tokens.
 *
 * Tokens are returned exactly as written (no trimming). Because
 * StringTokenizer is used, consecutive commas do not produce empty tokens.
 *
 * @param str the raw property value; may be null or empty
 * @return a new, possibly empty, list of the tokens in their original order
 */
private List parsePropCommaSeparated(String str)
{
    ArrayList tokens = new ArrayList();
    if(str == null || str.length() == 0)
    {
        return tokens;
    }
    for(StringTokenizer splitter = new StringTokenizer(str, ","); splitter.hasMoreTokens(); )
    {
        tokens.add(splitter.nextToken());
    }
    return tokens;
}
/**
 * Loads the list of websites to download from a text file.
 *
 * Each non-blank line holds a site URL, optionally followed by whitespace
 * and a filter pattern; any further columns are ignored. Leading and
 * trailing whitespace is ignored and blank lines are skipped. The parsed
 * entries replace the current website list as soon as the file has been
 * opened.
 *
 * @param websiteFile path of the website list file
 * @return true if the whole file was read; false if it could not be opened
 *         or an error occurred while reading (the error is logged and the
 *         list may then be partially filled)
 */
public boolean parseWebsites(String websiteFile)
{
    _logClass.info("enter parseWebsites");
    BufferedReader reader = null;
    try {
        reader = new BufferedReader(new FileReader(websiteFile));
        _websiteHash = new HashSet();

        String line;
        while ((line = reader.readLine()) != null) {
            String entry = line.trim();
            if (entry.length() == 0) {
                // A blank line used to add a site with an empty URL, and a
                // whitespace-only line used to abort the whole load.
                continue;
            }
            // entry is trimmed and non-empty, so the first column is never empty.
            String[] columns = entry.split("\\s+");
            _logClass.info("web name: " + columns[0]);

            // Site address, plus the filter pattern when one is given.
            SiteConfig sc = new SiteConfig();
            sc.setSiteUrl(columns[0]);
            if (columns.length >= 2) {
                sc.setFilterPattern(columns[1]);
            }
            _websiteHash.add(sc);
        }
        return true;
    }
    catch(Exception e) {
        // Still reported to the caller as false, but no longer silent.
        _logClass.error("Failed to load website list '" + websiteFile + "': " + e.getMessage(), e);
        return false;
    }
    finally {
        // Close on every path; the reader used to leak on errors.
        if (reader != null) {
            try {
                reader.close();
            }
            catch(IOException ignored) {
                // Nothing useful can be done if closing fails.
            }
        }
    }
}
/**
 * Returns the websites loaded by {@link #parseWebsites(String)}.
 *
 * @return a new array of the loaded sites; empty (never null) when no
 *         website list has been loaded yet
 */
public SiteConfig[] getWebsites()
{
    // _websiteHash stays null until parseWebsites() has opened a file;
    // without this guard the call below threw a NullPointerException.
    if(_websiteHash == null)
    {
        return new SiteConfig[0];
    }
    return (SiteConfig[]) _websiteHash.toArray(new SiteConfig[0]);
}
/**
 * Returns the next configured mobile number.
 *
 * Despite the name, the choice is not random: each call advances an
 * internal index by one and wraps around at the end, so successive calls
 * cycle through the configured numbers.
 *
 * @return the next mobile number, or null when no numbers are configured
 */
public synchronized String getRandomMobileNo() {
    // An unset or empty set used to fail with a NullPointerException or a
    // division by zero in the modulo below.
    if (mobileNos == null || mobileNos.isEmpty()) {
        return null;
    }
    String[] mobileNoArray = (String[]) mobileNos.toArray(new String[0]);
    _mobile_no_index = (_mobile_no_index + 1) % mobileNoArray.length;
    return mobileNoArray[_mobile_no_index];
}
/**
 * Sets the refreshHTMLs flag.
 * NOTE(review): presumably controls whether already-downloaded HTML pages
 * are fetched again - confirm against the spider code that reads it.
 *
 * @param refreshHTMLs the new flag value
 */
public void setRefreshHTMLs(boolean refreshHTMLs) {
    this.refreshHTMLs = refreshHTMLs;
}
/**
 * Returns the current value of the refreshHTMLs flag.
 *
 * @return the flag as last set by the constructor or setRefreshHTMLs
 */
public boolean refreshHTMLs() {
    return this.refreshHTMLs;
}
/**
 * Sets the refreshImages flag.
 * NOTE(review): presumably controls whether already-downloaded images are
 * fetched again - confirm against the spider code that reads it.
 *
 * @param refreshImages the new flag value
 */
public void setRefreshImages(boolean refreshImages) {
    this.refreshImages = refreshImages;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -