📄 htmlparser.java
字号:
/*
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
*
* Copyright (c) 2001 Brian Pitcher
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
// $Header: /CVSRepository/spider/cn/yicha/subject/spider/extractor/HTMLParser.java,v 1.2 2005/12/01 02:10:21 zhangdi Exp $
package cn.yicha.subject.spider.extractor;
import org.apache.log4j.Category;
import cn.yicha.common.util.Log4j;
import cn.yicha.common.util.StringUtil;
import cn.yicha.subject.spider.SpiderConfig;
import cn.yicha.subject.spider.URLObject;
import cn.yicha.subject.spider.URLToDownload;
import cn.yicha.subject.spider.store.ExtractAnchor;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.net.MalformedURLException;
import java.net.URL;
import java.io.ByteArrayInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.lang.Runtime;
import java.lang.System;
/**
 * Extracts the outgoing links of a downloaded page.
 *
 * Raw link strings are obtained from the ExtractAnchor helpers (anchor tags,
 * "go" hrefs, redirect tags and option tags), resolved against the page's
 * end URL, de-duplicated by their external form and filtered through
 * URLToDownload.isValidAnchor using the extension lists of the SpiderConfig.
 */
public class HTMLParser
{
    // Logger is registered under this class so that log lines are attributed
    // to the parser rather than to URLObject.
    private final static Category _logClass = Category.getInstance(HTMLParser.class);

    /** Spider configuration; supplies the extension lists and the mailto log file. */
    private SpiderConfig config;

    static
    {
        Log4j.init();
    }

    /**
     * @param config configuration consulted when validating links and when
     *               writing the mailto log
     */
    public HTMLParser(SpiderConfig config)
    {
        this.config = config;
    }

    /**
     * Parses a page and returns the links found in it.
     *
     * @param sourceURL   the page the text was downloaded from; relative links
     *                    are resolved against its end URL
     * @param textContent the page text; null yields an empty list
     * @return a list of URLToDownload objects, never null
     */
    public List parseLinksInDocument(URLToDownload sourceURL, String textContent)
    {
        if (textContent == null)
        {
            // Nothing to scan; avoid handing null to the extractors.
            return new ArrayList();
        }
        return parseAsHTML(sourceURL, textContent);
    }

    /**
     * Parses the page and collects every link found by the four extractors.
     * Extraction order (anchors, go hrefs, redirects, options) determines the
     * order of the returned list.
     */
    private List parseAsHTML(URLToDownload sourceURL, String textContent)
    {
        List newURLs = new ArrayList();
        // External forms of the URLs already accepted, used for de-duplication.
        Set newURLSet = new HashSet();

        extractAnchorsFromTags(sourceURL, newURLs, newURLSet, textContent);
        extractGoHrefs(sourceURL, newURLs, newURLSet, textContent);
        extractRedirectsFromTags(sourceURL, newURLs, newURLSet, textContent);
        extractOptionAnchors(sourceURL, newURLs, newURLSet, textContent);

        _logClass.info("Returning " + newURLs.size() + " urls extracted from page " + sourceURL.getURL().toExternalForm());

        // Only walk the list when the per-URL dump will actually be written.
        if (_logClass.isDebugEnabled())
        {
            for (int i = 0; i < newURLs.size(); i++)
            {
                URLToDownload u = (URLToDownload) newURLs.get(i);
                _logClass.debug("extract url --> " + u.getURL().toExternalForm());
            }
        }
        return newURLs;
    }

    /**
     * Collects the attribute values returned by ExtractAnchor.extractAnchorsFromTag.
     */
    private void extractAnchorsFromTags(URLToDownload sourceURL, List newURLs, Set newURLSet, String input)
    {
        String[] attrList = ExtractAnchor.extractAnchorsFromTag(input);
        addDecodedLinks(sourceURL, newURLs, newURLSet, attrList, null);
    }

    /**
     * Collects the attribute values returned by ExtractAnchor.extractRedirectsFromTag.
     */
    private void extractRedirectsFromTags(URLToDownload sourceURL, List newURLs, Set newURLSet, String input)
    {
        String[] attrList = ExtractAnchor.extractRedirectsFromTag(input);
        addDecodedLinks(sourceURL, newURLs, newURLSet, attrList, "extract redirect tag --> ");
    }

    /**
     * Collects the link values returned by ExtractAnchor.extractOptionsFromTag
     * (links carried by selection-control options).
     */
    private void extractOptionAnchors(URLToDownload sourceURL, List newURLs, Set newURLSet, String input)
    {
        String[] attrList = ExtractAnchor.extractOptionsFromTag(input);
        addDecodedLinks(sourceURL, newURLs, newURLSet, attrList, "extract option tag --> ");
    }

    /**
     * Collects the links found in "go" tags, both the ones returned by
     * ExtractAnchor.fetchGoHrefs and the self-closing &lt;go .../&gt; form
     * returned by ExtractAnchor.fetchTagGoHrefs.
     *
     * NOTE(review): an earlier comment stated that "&" replacement is not
     * needed here because these URLs are assembled by the extractor, yet the
     * code always performed the replacement. The replacement is kept; confirm
     * against ExtractAnchor whether it can be dropped.
     */
    private void extractGoHrefs(URLToDownload sourceURL, List newURLs, Set newURLSet, String input)
    {
        String[] hrefList = ExtractAnchor.fetchGoHrefs(input);
        addDecodedLinks(sourceURL, newURLs, newURLSet, hrefList, null);

        String[] hrefTagGoList = ExtractAnchor.fetchTagGoHrefs(input);
        addDecodedLinks(sourceURL, newURLs, newURLSet, hrefTagGoList, "extract pure go href --> ");
    }

    /**
     * Decodes each raw link and hands it to addAnchorLink.
     *
     * @param rawLinks  link strings as returned by an ExtractAnchor method
     * @param logPrefix prefix for an info-level log line per link, or null to
     *                  add the links without logging them
     */
    private void addDecodedLinks(URLToDownload sourceURL, List newURLs, Set newURLSet, String[] rawLinks, String logPrefix)
    {
        for (int i = 0; i < rawLinks.length; i++)
        {
            String link = decodeAmpersands(rawLinks[i]);
            if (logPrefix != null)
            {
                _logClass.info(logPrefix + link);
            }
            addAnchorLink(sourceURL, newURLs, newURLSet, link);
        }
    }

    /**
     * Turns the HTML entity "&amp;amp;" back into a plain "&" so that query
     * strings taken from markup resolve to the intended URL. Replacing "&"
     * with itself, as the code previously did, had no effect.
     */
    private static String decodeAmpersands(String value)
    {
        return StringUtil.replace(value, "&amp;", "&");
    }

    /**
     * Resolves a link against the page's end URL and, if it has not been seen
     * on this page and passes URLToDownload.isValidAnchor, appends it to the
     * result list.
     *
     * @param newURLs   result list of URLToDownload objects, appended to
     * @param newURLSet external forms of the URLs already in newURLs
     * @param link      the link text, absolute or relative
     */
    private void addAnchorLink(URLToDownload sourceURL, List newURLs, Set newURLSet, String link)
    {
        try
        {
            // Resolve relative links against the URL the page was finally served from.
            URL u = new URL(sourceURL.getEndURL(), link);
            String externalForm = u.toExternalForm();
            if (newURLSet.contains(externalForm))
            {
                return;
            }

            // Decide from the new link itself whether it is worth downloading.
            URLToDownload newUrl = new URLToDownload(u, 0, sourceURL.getServiceID());
            if (newUrl.isValidAnchor(config.getRingExtensions(), config.getGameExtensions(), config.getInvalidExtensions()))
            {
                newURLs.add(newUrl);
                newURLSet.add(externalForm);
            }
        }
        catch (MalformedURLException murle)
        {
            // Unparseable links are skipped on purpose; record them at debug
            // level so they can still be diagnosed.
            if (_logClass.isDebugEnabled())
            {
                _logClass.debug("Skipping malformed link '" + link + "': " + murle.getMessage());
            }
        }
    }

    /**
     * Appends a URL as one line to the mailto log file named by the
     * configuration. Failures are logged as warnings and otherwise ignored.
     */
    private void logMailURL(String url)
    {
        PrintWriter writer = null;
        try
        {
            // Second FileWriter argument = true: append instead of truncating.
            writer = new PrintWriter(new FileWriter(config.getMailtoLogFile().toString(), true));
            writer.println(url);
            // PrintWriter never throws on write; checkError() flushes and
            // reports whether any write failed.
            if (writer.checkError())
            {
                _logClass.warn("Failed writing mailto URL: " + url);
            }
        }
        catch (IOException ioe)
        {
            _logClass.warn("Caught IO exception writing mailto URL:" + ioe.getMessage(), ioe);
        }
        finally
        {
            if (writer != null)
            {
                writer.close();
            }
        }
    }

    /**
     * Check if a particular URL looks like it's a mailto: style link.
     *
     * @return true when the text contains "mailto:" in any letter case;
     *         false for null
     */
    private boolean isMailTo(String url)
    {
        if (url == null)
        {
            return false;
        }
        // Fixed locale: under a Turkish default locale "i" upper-cases to a
        // dotted capital I and the comparison would never match.
        return url.toUpperCase(Locale.ENGLISH).indexOf("MAILTO:") != -1;
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -