// File: extractanchor.java
// [Code-viewer control label: font size]
package cn.yicha.subject.spider.store;
import java.io.*;
import java.util.ArrayList;
import java.util.HashSet;
import java.lang.Character;
import org.apache.regexp.*;
import cn.yicha.common.util.*;
public class ExtractAnchor
{
// URLs of anchors that have already been collected; fetchMatchedPatterns
// consults and updates this set (under its monitor) to skip duplicates.
private static HashSet urlsFetchedAnchors = new HashSet();
/**
 * Extracts every anchor (&lt;a href=...&gt;text&lt;/a&gt;) found in a page.
 *
 * @param content page markup to scan
 * @param url     URL of the page; handed to each anchor as its absolute URL
 * @return the anchors returned by fetchMatchedPatterns for the anchor regex
 */
private static AnchorProperty[] extractAnchors(String content, String url)
{
    System.out.println("Enter extractAnchors");
    // Group 1 captures the href value, group 2 the text between <a ...> and </a>.
    final String anchorRegex = "<a\\s*href\\s*=\\s*[\"|'](.*?)[\"|']>(.*?)</a>";
    AnchorProperty[] anchors = fetchMatchedPatterns(content, anchorRegex, url);
    return anchors;
}
/**
 * Scans content for every match of a two-group regex and turns each match
 * into an AnchorProperty.
 *
 * @param content     source text to scan
 * @param pattern     regex whose group 1 is the link and group 2 the title
 * @param absoluteUrl value passed to AnchorProperty.setAbsoluteUrl for every anchor
 * @return anchors whose getUrl() value had not been recorded in
 *         urlsFetchedAnchors before this call, in match order
 */
private static AnchorProperty[] fetchMatchedPatterns(String content, String pattern, String absoluteUrl)
{
    RE regex = StringParser.getPatternObj(pattern);
    CharacterIterator source = new ReaderCharacterIterator(new StringReader(content));
    ArrayList collected = new ArrayList();

    int searchFrom = 0;
    while (regex.match(source, searchFrom))
    {
        // Group 1: the link target.
        int hrefStart = regex.getParenStart(1);
        int hrefEnd = regex.getParenEnd(1);
        if (hrefStart < 0 || hrefEnd < 0) {
            break;
        }
        String anchorHref = source.substring(hrefStart, hrefEnd);

        // Group 2: the anchor text.
        int titleStart = regex.getParenStart(2);
        int titleEnd = regex.getParenEnd(2);
        if (titleStart < 0 || titleEnd < 0) {
            break;
        }
        String anchorTitle = source.substring(titleStart, titleEnd);

        // The next search resumes where the title group ended.
        searchFrom = titleEnd;

        AnchorProperty anchor = new AnchorProperty();
        anchor.setAbsoluteUrl(absoluteUrl);
        anchor.setRelativeUrl(anchorHref);
        anchor.setTitle(anchorTitle);
        System.out.println(anchor.getTitle() + "\n" + anchor.getUrl());

        // NOTE(review): getUrl() presumably combines the absolute and relative
        // URLs set above -- confirm in AnchorProperty.
        String resolvedUrl = anchor.getUrl();
        synchronized (urlsFetchedAnchors)
        {
            // HashSet.add returns true only when the URL was not present yet.
            if (urlsFetchedAnchors.add(resolvedUrl)) {
                collected.add(anchor);
            }
        }
    }
    return (AnchorProperty[]) collected.toArray(new AnchorProperty[0]);
}
/**
 * Returns the directory in which the anchor log files are stored.
 *
 * @param basePath          base storage directory
 * @param serviceID         service identifier, used as a sub-directory name
 * @param downloadMonternet when true the result is basePath + "/" + serviceID,
 *                          otherwise basePath itself
 * @return the storage path
 */
public static String getSaveDiskPath(String basePath, String serviceID, boolean downloadMonternet)
{
    return downloadMonternet ? (basePath + "/" + serviceID) : basePath;
}
/**
 * Appends an array of anchors to a local file, one "title,url,depth" line
 * per anchor.
 *
 * The method is synchronized on the class, so concurrent callers write one
 * after another. Text is encoded with the platform default charset because
 * the OutputStreamWriter is created without an explicit one.
 *
 * @param anchorArray anchors to write
 * @param fileName    path of the file to append to; missing parent
 *                    directories are created
 * @param depth       value written as the third column of every line
 */
private synchronized static void saveDisk(AnchorProperty[] anchorArray, String fileName, int depth)
{
    // A bare file name has no parent directory; getParentFile() is null then.
    File parentDir = new File(fileName).getParentFile();
    if (parentDir != null) {
        parentDir.mkdirs();
    }

    PrintWriter pw = null;
    try {
        // Second constructor argument "true" opens the file in append mode.
        pw = new PrintWriter(
            new OutputStreamWriter(new FileOutputStream(fileName, true)));
        for (int i = 0; i < anchorArray.length; i++) {
            pw.println(anchorArray[i].getTitle() + "," + anchorArray[i].getUrl() + "," + depth);
        }
    }
    catch (java.io.FileNotFoundException e) {
        e.printStackTrace();
    }
    finally {
        // Always release the file handle, even if writing a line failed.
        if (pw != null) {
            pw.close();
        }
    }
}
/**
 * Extracts the host part (including any ":port") of a URL, lower-cased.
 *
 * A leading "http://" or "https://" is removed, and the result ends at the
 * first '/', '?' or '#', or at the end of the string when none is present.
 *
 * @param url URL to inspect; must not be null
 * @return the lower-cased host portion of url
 */
private static String extractDomain(String url)
{
    final String[] schemePrefixes = { "http://", "https://" };
    final char[] hostTerminators = { '/', '?', '#' };

    url = url.toLowerCase();

    // Drop the scheme prefix, if any.
    for (int i = 0; i < schemePrefixes.length; i++) {
        if (url.startsWith(schemePrefixes[i])) {
            url = url.substring(schemePrefixes[i].length());
            break;
        }
    }

    // The host ends at the earliest path, query or fragment delimiter.
    int hostEnd = url.length();
    for (int i = 0; i < hostTerminators.length; i++) {
        int pos = url.indexOf(hostTerminators[i]);
        if (pos >= 0 && pos < hostEnd) {
            hostEnd = pos;
        }
    }
    return url.substring(0, hostEnd);
}
/**
 * Extracts the anchors (text and link) of a page and appends them to a file
 * named after the page's domain inside savePath.
 *
 * @param content  page markup
 * @param url      URL of the page
 * @param savePath directory that receives the per-domain file
 * @param depth    depth value stored with every anchor
 */
public static void fetchAnchors(String content, String url, String savePath, int depth)
{
    final AnchorProperty[] anchors = extractAnchors(content, url);
    saveDisk(anchors, savePath + "/" + extractDomain(url), depth);
}
/**
 * Extracts link data from paired &lt;go ...&gt;...&lt;/go&gt; elements.
 * Self-closing &lt;go .../&gt; tags are stripped from the content first.
 *
 * @param content page markup
 * @return the hrefs produced by extractMatchedGoHrefs
 */
public static String[] fetchGoHrefs(String content)
{
    // Group 1 captures the href value, group 2 the body of the go element.
    final String goRegex = "<go.*?href\\s*=\\s*[\"|'](.*?)[\"|'].*?>(.*?)</go>";
    String pairedGoOnly = delPureGo(content);
    return extractMatchedGoHrefs(pairedGoOnly, goRegex);
}
/**
 * Returns the text of every self-closing go tag (&lt;go .../&gt;) in the content.
 *
 * The "&lt;go" prefix is matched case-insensitively; the returned tags keep the
 * casing of the original content. A tag is treated as self-closing when the
 * character just before its closing '&gt;' is '/'. A trailing tag that has no
 * closing '&gt;' is ignored.
 *
 * @param content page markup; must not be null
 * @return the self-closing go tags in document order, possibly empty
 */
public static String[] getPureGo(String content)
{
    final String _GO_PREFIX = "<go";
    ArrayList results = new ArrayList();
    String lowerContent = content.toLowerCase();

    int pos = lowerContent.indexOf(_GO_PREFIX);
    // ">= 0" so that a tag at the very start of the content is not skipped.
    while (pos >= 0)
    {
        int closePos = content.indexOf('>', pos);
        if (closePos < 0) {
            // Unterminated tag at the end of the content: nothing more to collect.
            break;
        }
        if (content.charAt(closePos - 1) == '/') {
            results.add(content.substring(pos, closePos + 1));
        }
        pos = lowerContent.indexOf(_GO_PREFIX, closePos);
    }
    return (String[]) results.toArray(new String[0]);
}
/**
 * Returns the content with every self-closing go tag (&lt;go .../&gt;) removed.
 *
 * The "&lt;go" prefix is matched case-insensitively. A tag is treated as
 * self-closing when the character just before its closing '&gt;' is '/'.
 * All other text, including paired go elements and a trailing tag that has
 * no closing '&gt;', is kept unchanged.
 *
 * @param content page markup; must not be null
 * @return content without self-closing go tags
 */
private static String delPureGo(String content)
{
    final String _GO_PREFIX = "<go";
    String lowerContent = content.toLowerCase();
    StringBuffer results = new StringBuffer();

    int beginPos = 0;   // start of the text not yet copied to results
    int pos = lowerContent.indexOf(_GO_PREFIX);
    // ">= 0" so that a tag at the very start of the content is handled too.
    while (pos >= 0)
    {
        int closePos = content.indexOf('>', pos);
        if (closePos < 0) {
            // Unterminated tag: keep the remainder as it is.
            break;
        }
        if (content.charAt(closePos - 1) == '/') {
            // Copy everything up to the tag, then skip the tag itself.
            results.append(content.substring(beginPos, pos));
            beginPos = closePos + 1;
        }
        pos = lowerContent.indexOf(_GO_PREFIX, closePos);
    }
    results.append(content.substring(beginPos));
    return results.toString();
}
/**
 * Extracts the href value of every self-closing &lt;go .../&gt; tag.
 *
 * @param content page markup
 * @return one entry per self-closing go tag, holding whatever
 *         StringParser.getMatchedElement yields for the href pattern
 */
public static String[] fetchTagGoHrefs(String content)
{
    final String hrefRegex = "href\\s*=\\s*[\"|'](.*?)[\"|']";
    String[] goTags = getPureGo(content);
    String[] hrefs = new String[goTags.length];
    for (int idx = 0; idx < goTags.length; idx++)
    {
        hrefs[idx] = StringParser.getMatchedElement(goTags[idx], hrefRegex);
    }
    return hrefs;
}
/**
 * Builds a "name=value&amp;name=value" parameter string from the postfield
 * elements found inside the body of a go element.
 *
 * @param content body of a go element
 * @return the joined parameters without a trailing "&amp;" (empty when no
 *         postfield is found), or null as soon as a field value contains "$"
 */
private static String parseGoField(String content)
{
    final String openPostField = "<postfield(.*?)>";
    final String closedPostField = "<postfield(.*?)/>";
    final String nameRegex = "name\\s*=\\s*[\"|'](.*?)[\"|']";
    final String valueRegex = "value\\s*=\\s*[\"|'](.*?)[\"|']";

    // Try the open form first; fall back to the self-closing form.
    String[] postFields = StringParser.fetchMatchedPatterns(content, openPostField);
    if (postFields.length <= 0) {
        postFields = StringParser.fetchMatchedPatterns(content, closedPostField);
    }

    StringBuffer params = new StringBuffer();
    for (int idx = 0; idx < postFields.length; idx++)
    {
        String fieldName = StringParser.getMatchedElement(postFields[idx], nameRegex);
        String fieldValue = StringParser.getMatchedElement(postFields[idx], valueRegex);
        // Links whose values reference variables ("$...") are not assembled.
        if (fieldValue.indexOf("$") >= 0) {
            return null;
        }
        params.append(fieldName).append("=").append(fieldValue).append("&");
    }

    // Drop the trailing "&" left by the loop.
    String urlPara = params.toString();
    if (urlPara.endsWith("&")) {
        urlPara = urlPara.substring(0, urlPara.length() - 1);
    }
    return urlPara;
}
/**
 * Extracts submit links from go elements that match a two-group regex.
 *
 * For each match, group 1 is the href and group 2 the element body. The body
 * is turned into a parameter string by parseGoField and joined to the href
 * with "&amp;" when the href already contains "?", otherwise with "?".
 * Matches for which parseGoField returns null are skipped.
 *
 * @param content page markup to scan
 * @param pattern regex with the href in group 1 and the body in group 2
 * @return the assembled links in match order
 */
private static String[] extractMatchedGoHrefs(String content, String pattern)
{
    RE regex = StringParser.getPatternObj(pattern);
    CharacterIterator source = new ReaderCharacterIterator(new StringReader(content));
    ArrayList hrefs = new ArrayList();

    int searchFrom = 0;
    while (regex.match(source, searchFrom))
    {
        // Group 1: the href of the go element.
        int hrefStart = regex.getParenStart(1);
        int hrefEnd = regex.getParenEnd(1);
        if (hrefStart < 0 || hrefEnd < 0) {
            break;
        }
        String goHref = source.substring(hrefStart, hrefEnd);

        // Group 2: the body of the go element.
        int bodyStart = regex.getParenStart(2);
        int bodyEnd = regex.getParenEnd(2);
        if (bodyStart < 0 || bodyEnd < 0) {
            break;
        }
        // The next search resumes where the body group ended.
        searchFrom = bodyEnd;

        String goField = parseGoField(source.substring(bodyStart, bodyEnd));
        if (goField == null) {
            continue;
        }
        String separator = (goHref.indexOf("?") >= 0) ? "&" : "?";
        hrefs.add(goHref + separator + goField);
    }
    return (String[]) hrefs.toArray(new String[0]);
}
/**
 * Extracts every value of a given attribute on a given tag.
 *
 * @param tag    tag name, inserted verbatim into the regex
 * @param attr   attribute name, inserted verbatim into the regex
 * @param source page markup
 * @return the matches returned by StringParser.fetchMatchedPatterns
 */
public static String[] extractAttributesFromTag(String tag, String attr, String source)
{
    // "<tag ... attr = 'value'" with the quoted value captured in group 1.
    StringBuffer regex = new StringBuffer();
    regex.append("<").append(tag).append(".*?").append(attr);
    regex.append("\\s*=\\s*[\"|'](.*?)[\"|']");
    return StringParser.fetchMatchedPatterns(source, regex.toString());
}
/**
 * Extracts the href value of every "a" start tag in the source.
 *
 * @param source page markup
 * @return one entry per "a" start tag, holding whatever
 *         StringParser.getMatchedElement yields for the href pattern
 */
public static String[] extractAnchorsFromTag(String source)
{
    final String hrefRegex = "href\\s*=\\s*[\"|'](.*?)[\"|']";
    String[] startTags = getStartTagContents(source, "a");
    String[] hrefs = new String[startTags.length];
    for (int idx = 0; idx < startTags.length; idx++)
    {
        hrefs[idx] = StringParser.getMatchedElement(startTags[idx], hrefRegex);
    }
    return hrefs;
}
/**
 * Extracts the links of all selection controls, i.e. every "onpick"
 * attribute value in the source.
 *
 * @param source page markup
 * @return the matches returned by StringParser.fetchMatchedPatterns
 */
public static String[] extractOptionsFromTag(String source)
{
    final String onpickRegex = "onpick\\s*=\\s*[\"|'](.*?)[\"|']";
    String[] onpickValues = StringParser.fetchMatchedPatterns(source, onpickRegex);
    return onpickValues;
}
/**
 * Finds the next occurrence of a start tag whose name is exactly tag, so that
 * looking for "a" does not match "&lt;anchor&gt;".
 *
 * The comparison is case-insensitive. A candidate counts only when the
 * character right after "&lt;tag" is a space or whitespace character.
 *
 * @param content  page markup
 * @param tag      tag name to look for
 * @param startPos index at which the search starts
 * @return index of the '&lt;' of the tag, or -1 when there is none
 */
private static int getTagPos(String content, String tag, int startPos)
{
    return findTagStart(content.toLowerCase(), "<" + tag.toLowerCase(), startPos);
}

/**
 * Core of getTagPos working on already lower-cased input, so callers that
 * search repeatedly lower-case the content only once.
 */
private static int findTagStart(String lowerContent, String lowerPrefix, int startPos)
{
    int pos = lowerContent.indexOf(lowerPrefix, startPos);
    while (pos >= 0)
    {
        int afterPrefix = pos + lowerPrefix.length();
        if (afterPrefix >= lowerContent.length()) {
            // The prefix ends the content: there is no character to check
            // and no later occurrence is possible.
            return -1;
        }
        // isSpaceChar alone rejects tab and line breaks, which are legal
        // separators after a tag name; isWhitespace covers those.
        char ch = lowerContent.charAt(afterPrefix);
        if (Character.isSpaceChar(ch) || Character.isWhitespace(ch)) {
            break;
        }
        pos = lowerContent.indexOf(lowerPrefix, afterPrefix);
    }
    return pos;
}

/**
 * Returns the text of every start tag named tag, from its '&lt;' up to and
 * including the next '&gt;'.
 *
 * Tags are returned with the casing of the original content. A trailing tag
 * that has no closing '&gt;' is ignored.
 *
 * @param content page markup; must not be null
 * @param tag     tag name to look for (case-insensitive)
 * @return the start tags in document order, possibly empty
 */
public static String[] getStartTagContents(String content, String tag)
{
    ArrayList results = new ArrayList();
    String lowerContent = content.toLowerCase();
    String lowerPrefix = "<" + tag.toLowerCase();

    int pos = findTagStart(lowerContent, lowerPrefix, 0);
    // ">= 0" so that a tag at the very start of the content is not skipped.
    while (pos >= 0)
    {
        int closePos = content.indexOf('>', pos);
        if (closePos < 0) {
            // Unterminated tag at the end of the content.
            break;
        }
        results.add(content.substring(pos, closePos + 1));
        pos = findTagStart(lowerContent, lowerPrefix, closePos);
    }
    return (String[]) results.toArray(new String[0]);
}
/**
 * Extracts the href value of every "a" tag whose first attribute is href,
 * using a single regex over the whole source.
 *
 * @param source page markup
 * @return the matches returned by StringParser.fetchMatchedPatterns
 */
public static String[] extractOldAnchorsFromTag(String source)
{
    // "<a href = 'value'" with the quoted value captured in group 1.
    final String anchorHrefRegex = "<" + "a" + "\\s*" + "href" + "\\s*=\\s*[\"|'](.*?)[\"|']";
    return StringParser.fetchMatchedPatterns(source, anchorHrefRegex);
}
/**
 * Extracts every value of the "ontimer", "onenterforward" and
 * "onenterbackward" attributes in the source.
 *
 * @param source page markup
 * @return all matches, grouped by attribute in the order listed above
 */
public static String[] extractRedirectsFromTag(String source)
{
    final String[] redirectAttrs = {"ontimer", "onenterforward", "onenterbackward"};
    ArrayList redirects = new ArrayList();
    int attrIdx = 0;
    while (attrIdx < redirectAttrs.length)
    {
        String attrRegex = getPatternFromRedirectTag(redirectAttrs[attrIdx]);
        String[] found = StringParser.fetchMatchedPatterns(source, attrRegex);
        for (int k = 0; k < found.length; k++) {
            redirects.add(found[k]);
        }
        attrIdx++;
    }
    return (String[]) redirects.toArray(new String[0]);
}
/**
 * Builds the regex that captures the quoted value of the given attribute.
 *
 * @param tag attribute name, inserted verbatim at the start of the regex
 * @return tag followed by the "= quoted value" pattern, value in group 1
 */
private static String getPatternFromRedirectTag(String tag)
{
    final String quotedValueSuffix = "\\s*=\\s*[\"|'](.*?)[\"|']";
    return tag + quotedValueSuffix;
}
/**
 * Test helper: downloads a page and prints every href value of its "a" tags
 * to standard output.
 *
 * If reading the page fails, the stack trace is printed and the extraction
 * runs on an empty string.
 *
 * @param url address of the page to read
 */
public static void fetchAttributes(String url)
{
    String pageContent;
    try {
        pageContent = URLReader.readUrlContent(url);
    }
    catch (Exception e) {
        e.printStackTrace();
        pageContent = "";
    }

    String[] hrefs = extractAttributesFromTag("a", "href", pageContent);
    int idx = 0;
    while (idx < hrefs.length) {
        System.out.println(hrefs[idx]);
        idx++;
    }
}
}
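// Code-viewer keyboard shortcuts (page residue, not part of the Java source):
//   Copy code:         Ctrl + C
//   Search code:       Ctrl + F
//   Full-screen mode:  F11
//   Toggle theme:      Ctrl + Shift + D
//   Show shortcuts:    ?
//   Increase font:     Ctrl + =
//   Decrease font:     Ctrl + -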