📄 extractcontext.java
字号:
package com.newwatch.tools;
import com.newwatch.toolkit.dal.dataobject.ChannelLinkDO;
import com.newwatch.toolkit.util.StringUtil;
import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.Div;
import org.htmlparser.tags.Html;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.ParagraphTag;
import org.htmlparser.tags.ScriptTag;
import org.htmlparser.tags.SelectTag;
import org.htmlparser.tags.Span;
import org.htmlparser.tags.StyleTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableHeader;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class ExtractContext {
/** Platform line separator, read once from the {@code line.separator} system property. */
protected static final String lineSign = System.getProperty(
"line.separator");
/** Number of characters in {@link #lineSign}. */
protected static final int lineSign_size = lineSign.length();
/**
 * System-wide application context.
 *
 * <p>Built as a static initializer, so the three classpath XML configuration files listed
 * below are loaded as soon as this class is initialized.
 */
public static final ApplicationContext context = new ClassPathXmlApplicationContext(new String[] {
"newwatch/persistence.xml", "newwatch/biz-util.xml",
"newwatch/biz-dao.xml"
});
/**
 * Manual entry point: builds one hard-coded sample link and passes it to
 * {@code makeContext} on a fresh {@code ExtractContextConsole}.
 *
 * @param args command-line arguments; not read
 */
public static void main(String[] args) {
    ExtractContextConsole extractor = new ExtractContextConsole();

    // Sample page used for the run; the encoding must match what the site serves.
    ChannelLinkDO sampleLink = new ChannelLinkDO();
    sampleLink.setEncode("gb2312");
    sampleLink.setLink("http://www.qiche.com.cn/files/200712/12016.shtml");
    sampleLink.setLinktext("test");

    extractor.makeContext(sampleLink);
}
/**
 * Collects the HTML page information for a link and renders it into a snapshot file.
 *
 * <p>The page at {@code c.getLink()} is parsed with the encoding from {@code c.getEncode()}.
 * For every top-level {@code Html} node, the text gathered by {@code extractHtml} is merged
 * into the template file line by line. A template line containing one of the markers below
 * is replaced; every other line is copied through unchanged:
 * <ul>
 *   <li>{@code #context} - link back to the original page, the snapshot line, then the
 *       extracted content</li>
 *   <li>{@code #titledesc} - a list item linking to {@code test.htm} with the link text</li>
 *   <li>{@code #metatitle} - a {@code TITLE} element with the link text</li>
 *   <li>{@code #metadesc} - a description {@code META} element with the link text</li>
 * </ul>
 *
 * <p>Any exception is caught and printed to standard output; nothing is rethrown.
 *
 * @param c link descriptor supplying the URL, the page encoding and the link text
 */
public void makeContext(ChannelLinkDO c) {
    String metatitle = "<TITLE>{0}</TITLE>";
    String metadesc = "<META content={0} name=description>";
    // NOTE(review): the {0} placeholder below is never substituted, so it is written to the
    // output literally - presumably a timestamp was intended; confirm before changing.
    String netshap = "<p> 正文快照: 时间{0}</p> ";
    String tempLeate = "<LI class=active><A href=\"{0}\" target=_blank>{1}</A></LI>";
    String crop = "<p><A href=\"{0}\" target=_blank>{1}</A></p> ";
    String srcfilePath = "D:/kuaiso/site/templeate/context.vm";
    String destfilePath = "D:/kuaiso/site/test/test.htm";
    try {
        String siteUrl = getLinkUrl(c.getLink());
        Parser parser = new Parser(c.getLink());
        parser.setEncoding(c.getEncode());
        for (NodeIterator e = parser.elements(); e.hasMoreNodes();) {
            Node node = (Node) e.nextNode();
            if (!(node instanceof Html)) {
                continue;
            }
            PageContext pageContext = new PageContext();
            pageContext.setNumber(0);
            pageContext.setTextBuffer(new StringBuffer());
            // Extract the page content into the page context's text buffer.
            extractHtml(node, pageContext, siteUrl);
            String lineContext = pageContext.getTextBuffer().toString();

            // Nested try/finally so both streams are released even when the merge fails.
            BufferedReader reader = new BufferedReader(new InputStreamReader(
                        new FileInputStream(srcfilePath), "gbk"));
            try {
                BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                            new FileOutputStream(destfilePath), "gbk"));
                try {
                    String line;
                    while ((line = reader.readLine()) != null) {
                        if (line.indexOf("#context") >= 0) {
                            String tempCrop = StringUtil.replace(crop, "{0}",
                                    c.getLink());
                            tempCrop = StringUtil.replace(tempCrop, "{1}",
                                    " 原文链接: " + c.getLink());
                            writer.write(tempCrop + lineSign);
                            writer.write(netshap + lineSign);
                            writer.write(lineContext + lineSign);
                        } else if (line.indexOf("#titledesc") >= 0) {
                            String tempLine = StringUtil.replace(tempLeate,
                                    "{0}", "test.htm");
                            tempLine = StringUtil.replace(tempLine, "{1}",
                                    "标题: " + c.getLinktext());
                            writer.write(tempLine + lineSign);
                        } else if (line.indexOf("#metatitle") >= 0) {
                            // Render into a local so the template itself is never overwritten.
                            String renderedTitle = StringUtil.replace(metatitle, "{0}",
                                    c.getLinktext());
                            writer.write(renderedTitle + lineSign);
                        } else if (line.indexOf("#metadesc") >= 0) {
                            String renderedDesc = StringUtil.replace(metadesc, "{0}",
                                    c.getLinktext());
                            writer.write(renderedDesc + lineSign);
                        } else {
                            writer.write(line + lineSign);
                        }
                    }
                    writer.flush();
                } finally {
                    writer.close();
                }
            } finally {
                reader.close();
            }
        }
    } catch (Exception e) {
        // Best-effort tool: report the failure and return instead of propagating it.
        System.out.println(e);
    }
}
/** Matches the scheme-and-host prefix of an absolute http(s) URL; compiled once and reused. */
private static final Pattern SITE_ROOT_PATTERN = Pattern.compile(
        "(https?://[^/?#]+)", Pattern.CASE_INSENSITIVE);

/**
 * Extracts the site root (scheme plus host, without a trailing slash) from a link.
 *
 * <p>Only the first URL found in the link is used, so a URL embedded later in the path or
 * query string cannot override the page's own host. Links without any path after the host
 * and {@code https} links are handled as well.
 *
 * @param link absolute page URL, e.g. {@code http://host/dir/page.html}
 * @return the site root such as {@code http://host}, or an empty string when
 *         {@code link} is {@code null} or contains no http(s) URL
 */
private String getLinkUrl(String link) {
    if (link == null) {
        return "";
    }
    Matcher matcher = SITE_ROOT_PATTERN.matcher(link);
    // First match only: later matches would be URLs nested inside the path or query.
    return matcher.find() ? matcher.group(1).trim() : "";
}
/**
* 递归钻取正文信息
* @param nodeP
* @return
*/
protected List extractHtml(Node nodeP, PageContext context, String siteUrl)
throws Exception {
NodeList nodeList = nodeP.getChildren();
boolean bl = false;
if ((nodeList == null) || (nodeList.size() == 0)) {
if (nodeP instanceof ParagraphTag) {
ArrayList tableList = new ArrayList();
StringBuffer temp = new StringBuffer();
temp.append("<p style=\"TEXT-INDENT: 2em\">");
tableList.add(temp);
temp = new StringBuffer();
temp.append("</p>").append(lineSign);
tableList.add(temp);
return tableList;
}
return null;
}
if ((nodeP instanceof TableTag) || (nodeP instanceof Div)) {
bl = true;
}
if (nodeP instanceof ParagraphTag) {
ArrayList tableList = new ArrayList();
StringBuffer temp = new StringBuffer();
temp.append("<p style=\"TEXT-INDENT: 2em\">");
tableList.add(temp);
extractParagraph(nodeP, siteUrl, tableList);
temp = new StringBuffer();
temp.append("</p>").append(lineSign);
tableList.add(temp);
return tableList;
}
ArrayList tableList = new ArrayList();
try {
for (NodeIterator e = nodeList.elements(); e.hasMoreNodes();) {
Node node = (Node) e.nextNode();
if (node instanceof LinkTag) {
tableList.add(node);
setLinkImg(node, siteUrl);
} else if (node instanceof ImageTag) {
ImageTag img = (ImageTag) node;
if (img.getImageURL().toLowerCase().indexOf("http://") < 0) {
img.setImageURL(siteUrl + img.getImageURL());
} else {
img.setImageURL(img.getImageURL());
}
tableList.add(node);
} else if (node instanceof ScriptTag ||
node instanceof StyleTag || node instanceof SelectTag) {
} else if (node instanceof TextNode) {
if (node.getText().length() > 0) {
StringBuffer temp = new StringBuffer();
String text = collapse(node.getText()
.replaceAll(" ", "")
.replaceAll(" ", ""));
temp.append(text.trim());
tableList.add(temp);
}
} else {
if (node instanceof TableTag || node instanceof Div) {
TableValid tableValid = new TableValid();
isValidTable(node, tableValid);
if (tableValid.getTrnum() > 2) {
tableList.add(node);
continue;
}
}
List tempList = extractHtml(node, context, siteUrl);
if ((tempList != null) && (tempList.size() > 0)) {
Iterator ti = tempList.iterator();
while (ti.hasNext()) {
tableList.add(ti.next());
}
}
}
}
} catch (Exception e) {
return null;
}
if ((tableList != null) && (tableList.size() > 0)) {
if (bl) {
StringBuffer temp = new StringBuffer();
Iterator ti = tableList.iterator();
int wordSize = 0;
StringBuffer node;
int status = 0;
StringBuffer lineStart = new StringBuffer(
"<p style=\"TEXT-INDENT: 2em\">");
StringBuffer lineEnd = new StringBuffer("</p>" + lineSign);
while (ti.hasNext()) {
Object k = ti.next();
if (k instanceof LinkTag) {
if (status == 0) {
temp.append(lineStart);
status = 1;
}
node = new StringBuffer(((LinkTag) k).toHtml());
temp.append(node);
} else if (k instanceof ImageTag) {
if (status == 0) {
temp.append(lineStart);
status = 1;
}
node = new StringBuffer(((ImageTag) k).toHtml());
temp.append(node);
} else if (k instanceof TableTag) {
if (status == 0) {
temp.append(lineStart);
status = 1;
}
node = new StringBuffer(((TableTag) k).toHtml());
temp.append(node);
} else if (k instanceof Div) {
if (status == 0) {
temp.append(lineStart);
status = 1;
}
node = new StringBuffer(((Div) k).toHtml());
temp.append(node);
} else {
node = (StringBuffer) k;
if (status == 0) {
if (node.indexOf("<p") < 0) {
temp.append(lineStart);
temp.append(node);
wordSize = wordSize + node.length();
status = 1;
} else {
temp.append(node);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -