📄 spider.java
字号:
}
else {
continue;
}
}
downloadsInProgress++;
}
// 如果下载网页已发生错误,不再下载
synchronized(urlsError)
{
if (urlsError.contains(nextURL.getURL().getHost())) {
_logClass.info("error url --> " + nextURL);
downloadsInProgress--;
continue;
}
}
// 更新当前下载网页队列
synchronized(urlsDownloading)
{
urlsDownloading.add(nextURL);
}
int newDepth = nextURL.getDepth() + 1;
int maxDepth = config.getMaxDepth();
synchronized(urlsDownloading)
{
urlsDownloading.remove(nextURL);
}
// 获取需下载的新URL数据,设置serviceID,并对其过滤
List newURLs = downloadURL(nextURL, urlGetter, htmlParser, newDepth);
setServiceID(newURLs, nextURL);
newURLs = filterURLs(newURLs, nextURL);
// 向待下载队列添加本次分析网页得到的所有新链接
ArrayList u2dsToQueue = new ArrayList();
for(Iterator i = newURLs.iterator(); i.hasNext(); )
{
URLToDownload u = (URLToDownload) i.next();
// Download if not yet downloaded, and the new depth is less than the maximum
synchronized(urlsDownloadedOrScheduled)
{
if(!urlsDownloadedOrScheduled.contains(u.getURL().toExternalForm())
&& (maxDepth == 0 || newDepth <= maxDepth))
{
u2dsToQueue.add(new URLToDownload(u.getURL(), nextURL.getURL(), newDepth, u.getServiceID(), u.getFilterPattern(), u.getIsBeforeSubs()));
urlsDownloadedOrScheduled.add(u.getURL().toExternalForm());
}
}
}
nextURL = null;
newURLs = null;
synchronized(queue)
{
queue.queueURLs(u2dsToQueue);
downloadsInProgress--;
}
}
_logClass.info("Spider thread stopping");
NonPageLinkLog.tagFinish();
running--;
}
/**
 * Reports how many entries the download queue currently holds.
 * The queue's monitor is held while the size is read, matching the other
 * accesses to {@code queue} in this class.
 *
 * @return the value of {@code queue.size()} taken under the queue lock
 */
private int queueSize()
{
    final int pending;
    synchronized (queue)
    {
        pending = queue.size();
    }
    return pending;
}
/**
 * Obtains the document behind a queued URL and returns the links found in it.
 *
 * <p>The document is fetched through {@code urlGetter} unless it already
 * exists on disk (and no refresh is configured) or, in resume mode, it is
 * listed in the previous run's log. A valid document is written to disk when
 * refreshing is enabled or no local copy exists, and is then parsed for links.
 *
 * @param url        the queue entry to process
 * @param urlGetter  downloader used to fetch the document
 * @param htmlParser parser used to extract links from the document content
 * @param depth      not used by this method
 * @return the list produced by {@code htmlParser.parseLinksInDocument} (the
 *         caller treats its elements as {@code URLToDownload}), or an empty
 *         list when the object is invalid or is not a valid document
 */
private List downloadURL(URLToDownload url, URLGetter urlGetter, HTMLParser htmlParser, int depth)
{
    _logClass.info("downloadURL(" + url + ")");
    ArrayList noLinks = new ArrayList();

    URLObject page = new URLObject(url.getURL(), config, url.getServiceID(), url.getIsBeforeSubs());
    if (!page.existsOnDisk())
    {
        if (FileSpider.is_resume() && page.existsOnPrevLog())
        {
            // The link is already recorded in the previous log file.
            if (!config.refreshHTMLs())
            {
                Object previousEntry = FileSpider.get_prevLinkht().get(url.getURL().toExternalForm());
                NonPageLinkLog.add((HeaderContent) previousEntry);
                page.setErrorType(URLObject._OTHER_EXCEPTION);
            }
        }
        else
        {
            page = urlGetter.getURL(url);
        }
    }
    else if (config.refreshHTMLs() && page.isValidDoc())
    {
        // A local copy exists; fetch again only when refreshing is configured.
        page = urlGetter.getURL(url);
    }

    if (page.isValidObj())
    {
        // Only valid documents are stored and parsed.
        if (!page.isValidDoc())
        {
            return noLinks;
        }
        if (!config.refreshHTMLs() && page.existsOnDisk())
        {
            url.setEndURL(url.getURL());
        }
        else
        {
            // Persist the downloaded content to the local file.
            page.writeToFile();
        }
        // Extract the links contained in the page.
        return htmlParser.parseLinksInDocument(url, page.getStringContent());
    }

    // The object is not valid: record the URL as an exception URL.
    StoreDiffUrl.saveExceptionUrl(page.getSourceURL().toExternalForm());
    if (page.getErrorType() == URLObject._CONNECT_TIMEOUT_EXCEPTION)
    {
        synchronized (urlsError)
        {
            // Only the log entry is written here; adding the host to
            // urlsError is currently disabled.
            _logClass.info("add error host --> " + url.getURL().getHost());
        }
    }
    return noLinks;
}
/**
 * Filters a list of candidate URLs.
 *
 * <p>A candidate is dropped when its external form is already contained in
 * {@code urlsDownloadedOrScheduled}, or when {@code isValidUrl()} reports it
 * as invalid (the latter is logged). The whole pass runs while holding the
 * monitor of {@code urlsDownloadedOrScheduled}.
 *
 * @param URLs      candidates; each element is cast to {@code URLToDownload}
 * @param sourceUrl the page the candidates came from; not used by this method
 * @return a new list with the accepted candidates in their original order
 */
private List filterURLs(List URLs, URLToDownload sourceUrl)
{
    ArrayList accepted = new ArrayList();
    synchronized (urlsDownloadedOrScheduled)
    {
        Iterator cursor = URLs.iterator();
        while (cursor.hasNext())
        {
            URLToDownload candidate = (URLToDownload) cursor.next();
            URL target = candidate.getURL();

            // Already queued or downloaded: nothing to do for this one.
            boolean alreadyKnown = urlsDownloadedOrScheduled.contains(target.toExternalForm());
            if (!alreadyKnown)
            {
                // Reject illegal URLs, keep the rest.
                if (!candidate.isValidUrl())
                {
                    _logClass.info("invalid url --> " + target.toExternalForm());
                }
                else
                {
                    accepted.add(candidate);
                }
            }
        }
    }
    return accepted;
}
/**
 * Assigns a service ID and filter pattern to every newly discovered URL so
 * the downloaded data can be organised later.
 *
 * <p>When {@code config.downloadMonternet()} is true, the ID is taken from the
 * URL itself and falls back to the source page's ID if none is found.
 * Otherwise the ID is the domain name returned by
 * {@code ParseUrl.getDomainName} (when {@code config.filterBySecondDomain()}
 * is true) or the URL's host. The filter pattern is always copied from the
 * source page.
 *
 * @param URLs      new URLs; each element is cast to {@code URLToDownload}
 * @param sourceURL the page on which the URLs were found
 */
private void setServiceID(List URLs, URLToDownload sourceURL)
{
    for (Iterator links = URLs.iterator(); links.hasNext(); )
    {
        URLToDownload link = (URLToDownload) links.next();

        String resolvedID;
        if (!config.downloadMonternet())
        {
            // Free search: identify by domain name or by full host.
            resolvedID = config.filterBySecondDomain()
                    ? ParseUrl.getDomainName(link.getURL())
                    : link.getURL().getHost();
        }
        else
        {
            // Monternet search: the ID is the SP service identifier in the URL;
            // inherit the parent page's ID when the URL carries none.
            String fromUrl = link.fetchServiceIDFromUrl();
            resolvedID = (fromUrl != null) ? fromUrl : sourceURL.getServiceID();
        }

        link.setServiceID(resolvedID);
        link.setFilterPattern(sourceURL.getFilterPattern());
    }
}
/**
 * Debugging aid: writes every entry of {@code urlsError} to the log.
 *
 * <p>The iteration holds the monitor of {@code urlsError}, as the other
 * accesses to that collection in this class do, so the collection is not
 * modified by another thread that takes the same lock while it is being
 * traversed.
 */
private void printErrorUrls()
{
    synchronized (urlsError)
    {
        for (Iterator it = urlsError.iterator(); it.hasNext(); )
        {
            String errorUrl = (String) it.next();
            _logClass.info("print error url --> " + errorUrl);
        }
    }
}
/**
 * Manual test entry point: loads {@code config/spider.properties}, downloads
 * one hard-coded URL and runs the link parser over its content.
 *
 * <p>The process exits with status 1 when the properties file is missing or
 * cannot be read.
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args)
{
    // Load the configuration file.
    String propsFile = "config/spider.properties";
    Properties props = new Properties();
    FileInputStream propsIn = null;
    try
    {
        propsIn = new FileInputStream(propsFile);
        props.load(propsIn);
    }
    catch(FileNotFoundException fnfe)
    {
        // Report the path that was actually opened; args may be empty.
        _logClass.error("File not found: " + propsFile, fnfe);
        System.exit(1);
    }
    catch(IOException ioe)
    {
        _logClass.error("IO Exception caught reading config file: " + ioe.getMessage(), ioe);
        System.exit(1);
    }
    finally
    {
        // Release the stream even when load() fails part-way through.
        if (propsIn != null)
        {
            try
            {
                propsIn.close();
            }
            catch(IOException closeEx)
            {
                // The properties are already loaded (or the failure was
                // reported above); a failed close is logged, not fatal.
                _logClass.error("IO Exception caught closing config file: " + closeEx.getMessage(), closeEx);
            }
        }
    }
    try {
        SpiderConfig config = new SpiderConfig(props);
        // Build the URL to fetch.
        String url = "http://waps.cn/site_new.wap?page=161";
        URLToDownload urlDown = new URLToDownload(new URL(url), 0, "");
        URLGetter urlGetter = new URLGetter(config);
        HTMLParser htmlParser = new HTMLParser(config);
        URLObject urlObj = urlGetter.getURL(urlDown);
        htmlParser.parseLinksInDocument(urlDown, urlObj.getStringContent());
    }
    catch (Exception ex) {
        ex.printStackTrace();
    }
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -