⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 commonengine.java

📁 网页采集系统 ================= 安装配置 ------- 1 程序我就不说了 2 配置文件 applicationContext.xml 里面有详细的注释 3 已经
💻 JAVA
字号:
package com.laozizhu.search.impl;

import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.laozizhu.search.Engine;
import com.laozizhu.search.Item;
import com.laozizhu.search.ItemBase;
import com.laozizhu.search.util.HTMLDecoder;
import com.laozizhu.search.util.PageService;

/**
 * Base implementation of a search {@link Engine}.<br>
 * Drives the whole process of fetching one page and extracting its title, author, date and body
 * with configurable regular expressions.<br>
 * Subclasses can customize details by overriding individual methods.
 * 
 * @author 老紫竹(laozizhu.com)
 */
public class CommonEngine implements Engine {
  // Default title pattern, used when no title regex has been configured.
  private static final Pattern TITLE_PATTERN_DEFAULT = Pattern.compile("(?i)<title>(.*?)</title>", Pattern.DOTALL);

  // Default author pattern, used when no author regex has been configured.
  private static final Pattern AUTHOR_PATTERN_DEFAULT = Pattern.compile(
      "(?i)<meta name=\"Author\".*?content=\"(.+?)\".*?>", Pattern.DOTALL);

  // Separator appended after every extracted body fragment.
  private static final String BODY_SEPARATOR = "<hr class='laozizhu'/>";

  // <p ...> opening tags (but not <pre>, <param>, ...); replaced by a line break.
  private static final Pattern PARAGRAPH_TAG = Pattern.compile("(?i)<p\\b[^<>]*>");

  // <h1>..<h9> opening tags, with or without attributes; replaced by a line break.
  private static final Pattern HEADING_TAG = Pattern.compile("(?i)<h\\d\\b[^<>]*>");

  // <br>, <br/>, <br /> and similar; replaced by a line break.
  private static final Pattern BREAK_TAG = Pattern.compile("(?i)<br\\s*/?>");

  // Any remaining tag. The character class excludes '<' so that a match can never start at one
  // tag and swallow the text up to the end of a later tag; it does include line breaks, so tags
  // spread over several lines are removed as well.
  private static final Pattern ANY_TAG = Pattern.compile("<[^<>]*>");

  // A run of line breaks that are separated only by whitespace.
  private static final Pattern BLANK_LINES = Pattern.compile("(\r?\n(\\s*\r?\n)+)");

  private String titlePatternString;

  private String authorPatternString;

  private String datePatternString;

  private List<String> bodyPatternStringList;

  // Lazily compiled patterns. They are volatile because they are read outside the lock in the
  // double-checked getters below.
  private volatile Pattern titlePattern;

  private volatile Pattern authorPattern;

  private volatile Pattern datetPattern;

  private volatile List<Pattern> bodyPatternList;

  // Whether HTML markup is removed from the extracted body.
  private boolean stripHtml = true;

  // Character set used when fetching pages.
  private String charset = "UTF-8";

  /**
   * Hook for custom processing of the extracted body text.
   * 
   * @param body the extracted, not yet processed text
   * @return the processed text; this implementation returns {@code body} unchanged
   */
  public String afterBody(String body) {
    return body;
  }

  /**
   * Hook for custom processing of the extracted title.
   * 
   * @param title the extracted, trimmed title
   * @return the processed title; this implementation returns {@code title} unchanged
   */
  public String afterTitle(String title) {
    return title;
  }

  /**
   * Fetches the page at the given URL and extracts title, author, creation date and body.
   * <p>
   * A field whose pattern does not match is left unset. The date is skipped entirely when no date
   * pattern is configured. Every body match is followed by a separator element; when
   * {@link #isStripHtml()} is true the assembled body is passed through
   * {@link #stripHtml(String)}.
   * 
   * @param url address of the page to parse
   * @return the populated item
   */
  public Item parseItem(String url) {
    String page = PageService.getPage(url, getCharset());
    Item item = new ItemBase();
    item.setUrl(url);

    Matcher m = getTitlePattern().matcher(page);
    if (m.find()) {
      item.setTitle(afterTitle(firstGroup(m)));
    }

    m = getAuthorPattern().matcher(page);
    if (m.find()) {
      item.setAuthor(firstGroup(m));
    }

    // The date pattern has no default, so it may legitimately be absent.
    Pattern datePattern = getDatetPattern();
    if (datePattern != null) {
      m = datePattern.matcher(page);
      if (m.find()) {
        item.setDatetimeCreate(joinGroups(m));
      }
    }

    StringBuilder body = new StringBuilder();
    for (Pattern bodyPattern : getBodyPatternList()) {
      m = bodyPattern.matcher(page);
      while (m.find()) {
        body.append(firstGroup(m));
        body.append(BODY_SEPARATOR);
      }
    }
    item.setBody(afterBody(body.toString()));
    if (isStripHtml()) {
      item.setBody(stripHtml(item.getBody()));
    }
    return item;
  }

  /**
   * Returns the trimmed text of capturing group 1 of the current match. Falls back to the whole
   * match when the pattern declares no capturing group, and to the empty string when the group did
   * not take part in the match.
   */
  private static String firstGroup(Matcher m) {
    String text = m.groupCount() >= 1 ? m.group(1) : m.group();
    return text == null ? "" : text.trim();
  }

  /**
   * Builds the date text from the current match: with more than one capturing group the groups are
   * concatenated in order (groups that did not take part are skipped), otherwise the result is the
   * same as {@link #firstGroup(Matcher)}.
   */
  private static String joinGroups(Matcher m) {
    int count = m.groupCount();
    if (count <= 1) {
      return firstGroup(m);
    }
    StringBuilder joined = new StringBuilder();
    for (int i = 1; i <= count; i++) {
      String part = m.group(i);
      if (part != null) {
        joined.append(part);
      }
    }
    return joined.toString();
  }

  /**
   * Returns the pattern used to extract the title.
   * 
   * @return the pattern compiled from {@link #getTitlePatternString()}, or the built-in default
   *         when no pattern string is configured
   */
  protected Pattern getTitlePattern() {
    Pattern result = titlePattern;
    if (result == null) {
      synchronized (this) {
        result = titlePattern;
        if (result == null) {
          String regex = getTitlePatternString();
          result = regex == null ? TITLE_PATTERN_DEFAULT : Pattern.compile(regex, Pattern.DOTALL);
          titlePattern = result;
        }
      }
    }
    return result;
  }

  /**
   * Returns the pattern used to extract the author.
   * 
   * @return the pattern compiled from {@link #getAuthorPatternString()}, or the built-in default
   *         when no pattern string is configured
   */
  protected Pattern getAuthorPattern() {
    Pattern result = authorPattern;
    if (result == null) {
      synchronized (this) {
        result = authorPattern;
        if (result == null) {
          String regex = getAuthorPatternString();
          result = regex == null ? AUTHOR_PATTERN_DEFAULT : Pattern.compile(regex, Pattern.DOTALL);
          authorPattern = result;
        }
      }
    }
    return result;
  }

  /**
   * Returns the patterns used to extract the body.
   * 
   * @return the patterns compiled from {@link #getBodyPatternStringList()}, in the same order;
   *         empty when no pattern strings are configured
   */
  protected List<Pattern> getBodyPatternList() {
    List<Pattern> result = bodyPatternList;
    if (result == null) {
      synchronized (this) {
        result = bodyPatternList;
        if (result == null) {
          // Fill a local list first and publish it only when complete, so that no other thread
          // can observe a partially built list through the unlocked read above.
          List<Pattern> compiled = new LinkedList<Pattern>();
          List<String> sources = getBodyPatternStringList();
          if (sources != null) {
            for (String regex : sources) {
              compiled.add(Pattern.compile(regex, Pattern.DOTALL));
            }
          }
          result = compiled;
          bodyPatternList = result;
        }
      }
    }
    return result;
  }

  /**
   * Returns the pattern used to extract the creation date.
   * 
   * @return the pattern compiled from {@link #getDatePatternString()}, or {@code null} when no
   *         pattern string is configured (there is no default date pattern)
   */
  protected Pattern getDatetPattern() {
    Pattern result = datetPattern;
    if (result == null) {
      synchronized (this) {
        result = datetPattern;
        if (result == null) {
          String regex = getDatePatternString();
          if (regex == null) {
            return null;
          }
          result = Pattern.compile(regex, Pattern.DOTALL);
          datetPattern = result;
        }
      }
    }
    return result;
  }

  public String getTitlePatternString() {
    return titlePatternString;
  }

  /**
   * Sets the title regex and discards the cached compiled pattern so the new value takes effect.
   * 
   * @param titlePatternString the regex, or {@code null} to use the default
   */
  public void setTitlePatternString(String titlePatternString) {
    synchronized (this) {
      this.titlePatternString = titlePatternString;
      this.titlePattern = null;
    }
  }

  public String getAuthorPatternString() {
    return authorPatternString;
  }

  /**
   * Sets the author regex and discards the cached compiled pattern so the new value takes effect.
   * 
   * @param authorPatternString the regex, or {@code null} to use the default
   */
  public void setAuthorPatternString(String authorPatternString) {
    synchronized (this) {
      this.authorPatternString = authorPatternString;
      this.authorPattern = null;
    }
  }

  public String getDatePatternString() {
    return datePatternString;
  }

  /**
   * Sets the date regex and discards the cached compiled pattern so the new value takes effect.
   * 
   * @param datePatternString the regex, or {@code null} to disable date extraction
   */
  public void setDatePatternString(String datePatternString) {
    synchronized (this) {
      this.datePatternString = datePatternString;
      this.datetPattern = null;
    }
  }

  public List<String> getBodyPatternStringList() {
    return bodyPatternStringList;
  }

  /**
   * Sets the body regexes and discards the cached compiled patterns so the new value takes effect.
   * 
   * @param bodyPatternStringList the regexes, or {@code null} for none
   */
  public void setBodyPatternStringList(List<String> bodyPatternStringList) {
    synchronized (this) {
      this.bodyPatternStringList = bodyPatternStringList;
      this.bodyPatternList = null;
    }
  }

  public boolean isStripHtml() {
    return stripHtml;
  }

  public void setStripHtml(boolean stripHtml) {
    this.stripHtml = stripHtml;
  }

  public void setCharset(String charset) {
    this.charset = charset;
  }

  /**
   * Returns the character set of the site.
   * 
   * @return the charset name passed to the page fetcher; "UTF-8" unless changed
   */
  public String getCharset() {
    return charset;
  }

  /**
   * Removes HTML markup from a string.<br>
   * Paragraph, heading and line-break tags become line breaks; every other tag is dropped. The
   * result is then passed through {@code HTMLDecoder.decode} and
   * {@code StrTools.htmlencode}, and runs of blank lines are collapsed into one line break.<br>
   * The input should be well formed: a stray {@code <} that is later followed by a {@code >}
   * with no other {@code <} in between is treated as a tag and removed together with the text
   * between them.
   * 
   * @param content the text to clean; may be {@code null}
   * @return the cleaned text, or {@code null} when {@code content} is {@code null}
   */
  public static String stripHtml(String content) {
    if (content == null) {
      return null;
    }
    // Block-level and line-break tags turn into line breaks.
    content = PARAGRAPH_TAG.matcher(content).replaceAll("\r\n");
    content = HEADING_TAG.matcher(content).replaceAll("\r\n");
    content = BREAK_TAG.matcher(content).replaceAll("\r\n");
    // Drop every other tag, self-closing or not, without touching the text between tags.
    content = ANY_TAG.matcher(content).replaceAll("");
    // Decode entities, then encode special characters again.
    content = HTMLDecoder.decode(content);
    content = net.java2000.tools.StrTools.htmlencode(content);
    // Collapse multiple line breaks.
    content = BLANK_LINES.matcher(content).replaceAll("\r\n");
    return content;
  }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -