// File: commonengine.java
// (Code-viewer page residue: "Font size" control.)
package com.laozizhu.search.impl;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.laozizhu.search.Engine;
import com.laozizhu.search.Item;
import com.laozizhu.search.ItemBase;
import com.laozizhu.search.util.HTMLDecoder;
import com.laozizhu.search.util.PageService;
/**
 * Base implementation of a search engine page parser.<br>
 * It drives the whole parsing process: the page is fetched, then title, author, date and body
 * are extracted with configurable regular expressions.<br>
 * Subclasses can override individual methods to customize details.
 *
 * <p>The compiled patterns are created lazily and cached. The cache fields are {@code volatile}
 * and are only written while holding the lock on {@code this}; the pattern-string setters take
 * the same lock and clear the matching cache entry so that a new expression takes effect.
 *
 * @author 老紫竹(laozizhu.com)
 */
public class CommonEngine implements Engine {

  /** Default title pattern: the text between {@code <title>} and {@code </title>}. */
  private static final Pattern TITLE_PATTERN_DEFAULT =
      Pattern.compile("(?i)<title>(.*?)</title>", Pattern.DOTALL);

  /** Default author pattern: the {@code content} attribute of a {@code <meta name="Author">} tag. */
  private static final Pattern AUTHOR_PATTERN_DEFAULT =
      Pattern.compile("(?i)<meta name=\"Author\".*?content=\"(.+?)\".*?>", Pattern.DOTALL);

  /** Opening {@code <p>} tag, with or without attributes (does not match {@code <pre>} etc.). */
  private static final Pattern PARAGRAPH_TAG = Pattern.compile("(?i)<p\\b[^>]*>");

  /** Opening {@code <h1>}..{@code <h9>} tag, with or without attributes. */
  private static final Pattern HEADING_TAG = Pattern.compile("(?i)<h\\d\\b[^>]*>");

  /** {@code <br>}, {@code <br/>} and {@code <br />}. */
  private static final Pattern LINE_BREAK_TAG = Pattern.compile("(?i)<br\\s*/?>");

  /**
   * Any single tag. {@code [^>]*} keeps the match inside one tag (a lazy {@code .*?} followed by
   * {@code />} would run from the first {@code <} to the first self-closing tag and swallow the
   * text in between) and also lets a tag span several lines.
   */
  private static final Pattern ANY_TAG = Pattern.compile("<[^>]*>");

  /** Two or more consecutive line breaks, possibly separated by whitespace. */
  private static final Pattern BLANK_LINES = Pattern.compile("(\r?\n(\\s*\r?\n)+)");

  // Regular expression sources; null means "not configured".
  private String titlePatternString;
  private String authorPatternString;
  private String datePatternString;
  private List<String> bodyPatternStringList;

  // Lazily compiled patterns. volatile is required for the check-lock-check initialization below.
  private volatile Pattern titlePattern;
  private volatile Pattern authorPattern;
  private volatile Pattern datetPattern;
  private volatile List<Pattern> bodyPatternList;

  // Whether HTML markup is removed from the body.
  private boolean stripHtml = true;

  // Character set used when fetching pages.
  private String charset = "UTF-8";

  /**
   * Hook called with the extracted body so subclasses can post-process it.
   *
   * @param body the extracted, not yet processed body text
   * @return the processed text; this implementation returns {@code body} unchanged
   */
  public String afterBody(String body) {
    return body;
  }

  /**
   * Hook called with the extracted title so subclasses can post-process it.
   *
   * @param title the extracted, trimmed title
   * @return the processed title; this implementation returns {@code title} unchanged
   */
  public String afterTitle(String title) {
    return title;
  }

  /**
   * Fetches the page at {@code url} and extracts title, author, creation date and body from it.
   *
   * <p>Every match of every body pattern is appended to the body, each followed by an
   * {@code <hr class='laozizhu'/>} separator. If {@link #isStripHtml()} is true the body is then
   * passed through {@link #stripHtml(String)}.
   *
   * @param url address of the page to parse
   * @return the parsed item; fields whose pattern did not match are left unset
   */
  public Item parseItem(String url) {
    // NOTE(review): assumes PageService.getPage never returns null — confirm.
    String page = PageService.getPage(url, getCharset());
    Item item = new ItemBase();
    item.setUrl(url);

    Matcher m = getTitlePattern().matcher(page);
    if (m.find()) {
      item.setTitle(afterTitle(m.group(1).trim()));
    }

    m = getAuthorPattern().matcher(page);
    if (m.find()) {
      item.setAuthor(m.group(1).trim());
    }

    // The date is optional: no pattern configured means no date is extracted.
    Pattern datePattern = getDatetPattern();
    if (datePattern != null) {
      m = datePattern.matcher(page);
      if (m.find()) {
        item.setDatetimeCreate(joinDateGroups(m));
      }
    }

    StringBuilder body = new StringBuilder();
    for (Pattern bodyPattern : getBodyPatternList()) {
      m = bodyPattern.matcher(page);
      while (m.find()) {
        body.append(m.group(1).trim());
        body.append("<hr class='laozizhu'/>");
      }
    }
    item.setBody(afterBody(body.toString()));
    if (isStripHtml()) {
      item.setBody(stripHtml(item.getBody()));
    }
    return item;
  }

  /**
   * Builds the date string from a successful date match.
   *
   * @param m a matcher on which {@code find()} has just succeeded
   * @return the trimmed whole match if the pattern has no groups, the trimmed group if it has
   *         exactly one, otherwise all groups concatenated in order; groups that did not
   *         take part in the match are skipped
   */
  private static String joinDateGroups(Matcher m) {
    int groups = m.groupCount();
    if (groups == 0) {
      return m.group().trim();
    }
    StringBuilder b = new StringBuilder();
    for (int i = 1; i <= groups; i++) {
      String part = m.group(i);
      // group(i) is null for an optional group that did not participate in the match.
      if (part != null) {
        b.append(part);
      }
    }
    // With several groups the pieces are joined verbatim, as whitespace may be significant.
    return groups == 1 ? b.toString().trim() : b.toString();
  }

  /**
   * Returns the compiled title pattern.
   *
   * @return the pattern compiled from {@link #getTitlePatternString()} with {@code DOTALL}, or
   *         the default {@code <title>} pattern if no expression is configured
   */
  protected Pattern getTitlePattern() {
    Pattern result = titlePattern;
    if (result == null) {
      synchronized (this) {
        result = titlePattern;
        if (result == null) {
          String regex = getTitlePatternString();
          result = (regex == null) ? TITLE_PATTERN_DEFAULT : Pattern.compile(regex, Pattern.DOTALL);
          titlePattern = result;
        }
      }
    }
    return result;
  }

  /**
   * Returns the compiled author pattern.
   *
   * @return the pattern compiled from {@link #getAuthorPatternString()} with {@code DOTALL}, or
   *         the default {@code <meta name="Author">} pattern if no expression is configured
   */
  protected Pattern getAuthorPattern() {
    Pattern result = authorPattern;
    if (result == null) {
      synchronized (this) {
        result = authorPattern;
        if (result == null) {
          String regex = getAuthorPatternString();
          result = (regex == null) ? AUTHOR_PATTERN_DEFAULT : Pattern.compile(regex, Pattern.DOTALL);
          authorPattern = result;
        }
      }
    }
    return result;
  }

  /**
   * Returns the compiled body patterns.
   *
   * @return one pattern (compiled with {@code DOTALL}) per entry of
   *         {@link #getBodyPatternStringList()}, in the same order; an empty list if no
   *         expressions are configured
   */
  protected List<Pattern> getBodyPatternList() {
    List<Pattern> result = bodyPatternList;
    if (result == null) {
      synchronized (this) {
        result = bodyPatternList;
        if (result == null) {
          // Fill a local list first so other threads never observe a half-built list.
          List<Pattern> compiled = new LinkedList<Pattern>();
          List<String> regexes = getBodyPatternStringList();
          if (regexes != null) {
            for (String regex : regexes) {
              compiled.add(Pattern.compile(regex, Pattern.DOTALL));
            }
          }
          bodyPatternList = compiled;
          result = compiled;
        }
      }
    }
    return result;
  }

  /**
   * Returns the compiled date pattern.
   *
   * @return the pattern compiled from {@link #getDatePatternString()} with {@code DOTALL}, or
   *         {@code null} if no date expression is configured
   */
  protected Pattern getDatetPattern() {
    Pattern result = datetPattern;
    if (result == null) {
      synchronized (this) {
        result = datetPattern;
        if (result == null) {
          String regex = getDatePatternString();
          if (regex != null) {
            result = Pattern.compile(regex, Pattern.DOTALL);
            datetPattern = result;
          }
        }
      }
    }
    return result;
  }

  /** @return the title regular expression, or {@code null} if the default is used */
  public String getTitlePatternString() {
    return titlePatternString;
  }

  /**
   * Sets the title regular expression and discards the previously compiled pattern.
   *
   * @param titlePatternString expression whose group 1 is the title; {@code null} for the default
   */
  public synchronized void setTitlePatternString(String titlePatternString) {
    this.titlePatternString = titlePatternString;
    this.titlePattern = null;
  }

  /** @return the author regular expression, or {@code null} if the default is used */
  public String getAuthorPatternString() {
    return authorPatternString;
  }

  /**
   * Sets the author regular expression and discards the previously compiled pattern.
   *
   * @param authorPatternString expression whose group 1 is the author; {@code null} for the default
   */
  public synchronized void setAuthorPatternString(String authorPatternString) {
    this.authorPatternString = authorPatternString;
    this.authorPattern = null;
  }

  /** @return the date regular expression, or {@code null} if none is configured */
  public String getDatePatternString() {
    return datePatternString;
  }

  /**
   * Sets the date regular expression and discards the previously compiled pattern.
   *
   * @param datePatternString expression whose group(s) form the date; {@code null} for none
   */
  public synchronized void setDatePatternString(String datePatternString) {
    this.datePatternString = datePatternString;
    this.datetPattern = null;
  }

  /** @return the body regular expressions, or {@code null} if none are configured */
  public List<String> getBodyPatternStringList() {
    return bodyPatternStringList;
  }

  /**
   * Sets the body regular expressions and discards the previously compiled patterns.
   *
   * @param bodyPatternStringList expressions whose group 1 is a body fragment
   */
  public synchronized void setBodyPatternStringList(List<String> bodyPatternStringList) {
    this.bodyPatternStringList = bodyPatternStringList;
    this.bodyPatternList = null;
  }

  /** @return whether HTML markup is removed from the parsed body */
  public boolean isStripHtml() {
    return stripHtml;
  }

  /** @param stripHtml whether HTML markup should be removed from the parsed body */
  public void setStripHtml(boolean stripHtml) {
    this.stripHtml = stripHtml;
  }

  /** @param charset character set passed to the page fetcher */
  public void setCharset(String charset) {
    this.charset = charset;
  }

  /**
   * Returns the character set of the site.
   *
   * @return the character set passed to the page fetcher; {@code "UTF-8"} unless changed
   */
  public String getCharset() {
    return charset;
  }

  /**
   * Removes HTML markup from a string.<br>
   * Opening paragraph and heading tags and line-break tags become line breaks, every other tag
   * is removed, the text is passed through {@code HTMLDecoder.decode} and then
   * {@code StrTools.htmlencode}, and finally runs of blank lines are collapsed into a single
   * line break.<br>
   * The input must be well formed: a stray {@code <} is treated as the start of a tag and
   * everything up to the next {@code >} is removed with it.
   *
   * @param content the text to clean; may be {@code null}
   * @return the cleaned text, or {@code null} if {@code content} is {@code null}
   */
  public static String stripHtml(String content) {
    if (content == null) {
      return null;
    }
    // Block-level openers and <br> become line breaks.
    String text = PARAGRAPH_TAG.matcher(content).replaceAll("\r\n");
    text = HEADING_TAG.matcher(text).replaceAll("\r\n");
    text = LINE_BREAK_TAG.matcher(text).replaceAll("\r\n");
    // Every remaining tag (opening, closing or self-closing) is dropped, one tag at a time.
    text = ANY_TAG.matcher(text).replaceAll("");
    // Decode entities, then re-encode special characters.
    text = HTMLDecoder.decode(text);
    text = net.java2000.tools.StrTools.htmlencode(text);
    // Collapse multiple line breaks.
    return BLANK_LINES.matcher(text).replaceAll("\r\n");
  }
}
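// Code-viewer page residue (keyboard shortcut help), kept as a comment so the file is valid Java:
//   Copy code:        Ctrl + C
//   Search code:      Ctrl + F
//   Full screen:      F11
//   Switch theme:     Ctrl + Shift + D
//   Show shortcuts:   ?
//   Larger font:      Ctrl + =
//   Smaller font:     Ctrl + -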