⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 lucenepaodingstore.java

📁 网页采集系统 ================= 安装配置 ------- 1 程序我就不说了 2 配置文件 applicationContext.xml 里面有详细的注释 3 已经
💻 JAVA
字号:
package com.laozizhu.search.store;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocCollector;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.LockObtainFailedException;
import com.laozizhu.search.Item;
import com.laozizhu.search.ItemBase;
import com.laozizhu.search.SearchResult;
import com.laozizhu.search.SearchResultBase;
import com.laozizhu.search.Store;

/**
 * 基于庖丁解牛的Lucene 2.4的全文搜索代码。
 * 
 * @author 老紫竹研究室(laozizhu.com)
 */
/**
 * Full-text search {@link Store} built on Lucene 2.4 with the Paoding ("庖丁解牛")
 * Chinese analyzer.
 *
 * <p>All index mutations (add / delete / optimize) are serialized through a single
 * class-wide lock because Lucene 2.4 allows only one open {@code IndexWriter}
 * per directory at a time.
 *
 * @author 老紫竹研究室 (laozizhu.com)
 */
public class LucenePaoDingStore implements Store {
  /** Serializes every operation that opens an IndexWriter/IndexReader for mutation. */
  private static final Object lock = new Object();

  /** Filesystem directory that holds the Lucene index. */
  private String indexPath = "d:/indexpaoding/lzzSearch";

  public String getIndexPath() {
    return indexPath;
  }

  public void setIndexPath(String indexPath) {
    this.indexPath = indexPath;
  }

  /**
   * Upper bound on the number of hits collected per search.
   */
  private int maxResult = 1000;

  public int getMaxResult() {
    return maxResult;
  }

  public void setMaxResult(int maxResult) {
    this.maxResult = maxResult;
  }

  /**
   * Returns a fresh Paoding analyzer. A new instance is created per call; the
   * method stays {@code synchronized} to preserve the original external contract.
   */
  public synchronized Analyzer getAnalyzer() {
    return new PaodingAnalyzer();
  }

  /** Number of documents added since the last optimize pass. */
  private int totalAdd = 0;

  /** Once more than this many documents have been added, the index is optimized. */
  private int totalToOptimize = 20;

  private static final String FIELD_URL = "url";

  private static final String FIELD_AUTHOR = "author";

  private static final String FIELD_TITLE = "title";

  private static final String FIELD_DATETIMECREATE = "datetimeCreate";

  private static final String FIELD_BODY = "body";

  /**
   * Opens a writer on {@link #indexPath}, creating the index if absent.
   *
   * @throws CorruptIndexException if the existing index is corrupt
   * @throws LockObtainFailedException if another writer holds the directory lock
   * @throws IOException on any other I/O failure
   */
  private IndexWriter getIndexWriter() throws CorruptIndexException, LockObtainFailedException, IOException {
    return new IndexWriter(indexPath, getAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);
  }

  /**
   * Indexes one item, replacing any previously indexed document with the same URL.
   * Periodically optimizes the index (every {@link #getTotalToOptimize()} adds).
   *
   * @param item the item to index; must have non-null author, title, body and create time
   * @return {@code true} if the item was stored, {@code false} on invalid input or error
   */
  public synchronized boolean save(Item item) {
    if (item == null || item.getAuthor() == null || item.getTitle() == null || item.getBody() == null
        || item.getDatetimeCreate() == null) {
      return false;
    }
    IndexWriter writer = null;
    synchronized (lock) {
      try {
        // Remove any stale document for the same URL so the index holds one entry per URL.
        deleteByUrl(item.getUrl());
        writer = getIndexWriter();
        // Allow very large bodies; the writer default would silently truncate them.
        writer.setMaxFieldLength(10000000);
        Document doc = new Document(); // one Document is one "row" of the index
        doc.add(new Field(FIELD_URL, item.getUrl(), Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field(FIELD_AUTHOR, item.getAuthor(), Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field(FIELD_TITLE, item.getTitle(), Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field(FIELD_DATETIMECREATE, item.getDatetimeCreate(), Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field(FIELD_BODY, item.getBody(), Field.Store.YES, Field.Index.ANALYZED));
        writer.addDocument(doc);
        totalAdd++;
        if (totalAdd > getTotalToOptimize()) {
          writer.optimize(); // merge segments to keep search fast
          totalAdd = 0;
        }
        return true;
      } catch (Exception e) {
        e.printStackTrace();
        return false;
      } finally {
        try {
          if (writer != null) {
            writer.close();
          }
        } catch (Exception e) {
          e.printStackTrace();
        }
      }
    }
  }

  /**
   * Forces a full index optimization (segment merge).
   */
  public void optimize() {
    IndexWriter writer = null;
    synchronized (lock) {
      try {
        writer = getIndexWriter();
        writer.optimize();
      } catch (IOException e) {
        e.printStackTrace();
      } finally {
        try {
          if (writer != null) {
            writer.close();
          }
        } catch (Exception e) {
          e.printStackTrace();
        }
      }
    }
  }

  /**
   * Deletes the document with the given internal Lucene document id.
   * Errors are logged rather than propagated (best-effort delete).
   *
   * @param id internal Lucene document number
   */
  public synchronized void deleteById(int id) {
    IndexReader indexReader = null;
    synchronized (lock) {
      try {
        indexReader = IndexReader.open(indexPath);
        indexReader.deleteDocument(id);
      } catch (IOException e) {
        // Fixed: the original swallowed this exception silently.
        e.printStackTrace();
      } finally {
        if (indexReader != null) {
          try {
            indexReader.close();
          } catch (Exception e) {
            e.printStackTrace();
          }
        }
      }
    }
  }

  /**
   * Deletes the document indexed under the given URL, if any.
   *
   * @param url the URL whose document should be removed
   */
  public synchronized void deleteByUrl(String url) {
    IndexReader indexReader = null;
    synchronized (lock) {
      try {
        // Look the document up BEFORE opening the deleting reader, so we do not
        // hold a reader open while searchByUrl opens a second one on the same
        // directory (the original opened the reader first).
        Item item = searchByUrl(url);
        System.out.println("Delete..." + url);
        if (item != null) {
          indexReader = IndexReader.open(indexPath);
          indexReader.deleteDocument(item.getId());
          System.out.println("OK");
        }
      } catch (IOException e) {
        e.printStackTrace();
      } finally {
        if (indexReader != null) {
          try {
            indexReader.close();
          } catch (Exception e) {
            e.printStackTrace();
          }
        }
      }
    }
  }

  /**
   * Returns up to {@code number} non-deleted items starting at document id {@code begin}.
   * Bodies are NOT loaded (only url/author/title/create time) to keep the listing cheap.
   *
   * @param begin first internal document id to inspect
   * @param number maximum number of items to return
   * @return the items found; possibly fewer than {@code number}, never {@code null}
   */
  public List<Item> findAll(int begin, int number) {
    IndexReader indexReader = null;
    List<Item> list = new LinkedList<Item>();
    try {
      indexReader = IndexReader.open(indexPath);
      // Fixed: bound the loop by maxDoc() instead of looping forever and relying
      // on Lucene throwing an exception past the end of the index.
      int maxDoc = indexReader.maxDoc();
      for (int i = begin; i < maxDoc && number > 0; i++) {
        if (indexReader.isDeleted(i)) {
          continue;
        }
        Document doc = indexReader.document(i);
        if (doc == null) {
          continue;
        }
        Item item = new ItemBase();
        item.setId(i);
        item.setUrl(doc.get(FIELD_URL));
        item.setAuthor(doc.get(FIELD_AUTHOR));
        item.setTitle(doc.get(FIELD_TITLE));
        item.setDatetimeCreate(doc.get(FIELD_DATETIMECREATE));
        list.add(item);
        number--;
      }
      return list;
    } catch (Exception e) {
      e.printStackTrace();
      return list;
    } finally {
      if (indexReader != null) {
        try {
          indexReader.close();
        } catch (Exception e) {
          e.printStackTrace();
        }
      }
    }
  }

  /**
   * Loads the full item (including body) stored at the given internal document id.
   *
   * @param id internal Lucene document number
   * @return the item, or {@code null} if it does not exist or on error
   */
  public Item searchById(int id) {
    IndexReader indexReader = null;
    try {
      indexReader = IndexReader.open(indexPath);
      Document doc = indexReader.document(id);
      Item item = null;
      if (doc != null) {
        item = new ItemBase();
        item.setId(id);
        item.setUrl(doc.get(FIELD_URL));
        item.setAuthor(doc.get(FIELD_AUTHOR));
        item.setTitle(doc.get(FIELD_TITLE));
        item.setDatetimeCreate(doc.get(FIELD_DATETIMECREATE));
        item.setBody(doc.get(FIELD_BODY));
      }
      return item;
    } catch (Exception e) {
      e.printStackTrace();
      return null;
    } finally {
      if (indexReader != null) {
        try {
          indexReader.close();
        } catch (Exception e) {
          e.printStackTrace();
        }
      }
    }
  }

  /**
   * Finds the single item indexed under the given URL.
   *
   * @param url the URL to look up (query-parser metacharacters are escaped first)
   * @return the matching item, or {@code null} if none was found or on error
   */
  public Item searchByUrl(String url) {
    IndexSearcher isearcher = null;
    try {
      // Escape characters (':', '?') that QueryParser would otherwise interpret.
      url = encodeURLForLucene(url);
      isearcher = new IndexSearcher(indexPath);
      QueryParser parser = new QueryParser(FIELD_URL, getAnalyzer());
      Query query = parser.parse(url);
      // Hits-based search is deprecated in Lucene 2.4; collect the top hit instead.
      TopDocCollector collector = new TopDocCollector(1);
      isearcher.search(query, collector);
      ScoreDoc[] hits = collector.topDocs().scoreDocs;
      Item item = null;
      if (hits.length > 0) {
        Document doc = isearcher.doc(hits[0].doc);
        item = new ItemBase();
        item.setId(hits[0].doc);
        item.setUrl(doc.get(FIELD_URL));
        item.setAuthor(doc.get(FIELD_AUTHOR));
        item.setTitle(doc.get(FIELD_TITLE));
        item.setDatetimeCreate(doc.get(FIELD_DATETIMECREATE));
        item.setBody(doc.get(FIELD_BODY));
      }
      return item;
    } catch (Exception e) {
      e.printStackTrace();
      return null;
    } finally {
      if (isearcher != null) {
        try {
          isearcher.close();
        } catch (Exception e) {
          e.printStackTrace();
        }
      }
    }
  }

  /**
   * Keyword search over the title and body fields, with hit highlighting
   * ({@code <read>..</read>} markers) and de-duplication by URL.
   *
   * <p>NOTE: the method name is misspelled ("seach") but is part of the
   * {@code Store} interface and therefore cannot be renamed here.
   *
   * <p>NOTE(review): because duplicate URLs inside the page window are skipped
   * without extending the window, a page may contain fewer than {@code number}
   * items — preserved as-is since callers may rely on the paging arithmetic.
   *
   * @param queryString raw user query; illegal query characters are stripped
   * @param begin zero-based offset of the first hit to return
   * @param number maximum number of hits to return
   * @return the result page, or {@code null} on invalid arguments or error
   */
  public SearchResult seach(String queryString, int begin, int number) {
    if (queryString == null) {
      return null;
    }
    if (begin >= getMaxResult() || number < 1) {
      return null;
    }
    queryString = removeIllegleCharForLucene(queryString);
    if (queryString.length() < 1) {
      return null;
    }
    IndexSearcher isearcher = null;
    SearchResult sr = new SearchResultBase();
    try {
      isearcher = new IndexSearcher(indexPath);
      // Search both fields; a match in either is enough (OR semantics).
      BooleanClause.Occur[] clauses = { BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD };
      TopDocCollector collector = new TopDocCollector(getMaxResult());
      Query query = MultiFieldQueryParser.parse(queryString, new String[] { FIELD_TITLE, FIELD_BODY }, clauses, getAnalyzer());
      isearcher.search(query, collector);
      ScoreDoc[] hits = collector.topDocs().scoreDocs;
      // Total is capped at maxResult by the collector size.
      sr.setTotal(hits.length);
      List<Item> rtn = new ArrayList<Item>();
      Set<String> urls = new HashSet<String>();
      // Highlight matches; the default formatter emits <b>..</b>, we want <read>..</read>.
      SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<read>", "</read>");
      Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
      // Fragment size bounds how much text is analyzed/returned; MAX_VALUE keeps
      // the whole body (a smaller value would highlight only the beginning).
      highlighter.setTextFragmenter(new SimpleFragmenter(Integer.MAX_VALUE));
      for (int i = begin; i < hits.length && i < begin + number; i++) {
        Document doc = isearcher.doc(hits[i].doc);
        if (!urls.contains(doc.get(FIELD_URL))) {
          Item o = new ItemBase();
          o.setId(hits[i].doc);
          o.setUrl(doc.get(FIELD_URL));
          o.setAuthor(doc.get(FIELD_AUTHOR));
          o.setDatetimeCreate(doc.get(FIELD_DATETIMECREATE));
          // getBestFragment(analyzer, fieldName, fieldText) returns null when the
          // query terms do not occur in the field; fall back to the raw text.
          o.setTitle(highlighter.getBestFragment(getAnalyzer(), FIELD_TITLE, doc.get(FIELD_TITLE)));
          if (o.getTitle() == null) {
            o.setTitle(doc.get(FIELD_TITLE));
          }
          o.setBody(highlighter.getBestFragment(getAnalyzer(), FIELD_BODY, doc.get(FIELD_BODY)));
          if (o.getBody() == null) {
            o.setBody(doc.get(FIELD_BODY));
          }
          urls.add(o.getUrl());
          rtn.add(o);
        }
      }
      sr.setReturnList(rtn);
      return sr;
    } catch (Exception e) {
      e.printStackTrace();
      return null;
    } finally {
      if (isearcher != null) {
        try {
          isearcher.close();
        } catch (Exception e) {
          e.printStackTrace();
        }
      }
    }
  }

  /**
   * Escapes characters that would break Lucene's QueryParser in a URL term.<br>
   * Currently only ':' and '?' are known to cause trouble.
   *
   * @param url raw URL
   * @return URL safe to feed to QueryParser
   */
  private static String encodeURLForLucene(String url) {
    return url.replace(":", "\\:").replace("?", "\\?");
  }

  /**
   * Strips characters that are illegal in a Lucene query string.
   *
   * @param keyword raw user keyword input
   * @return keyword with {@code : * ? [ ] . $ { }} removed
   */
  private static String removeIllegleCharForLucene(String keyword) {
    return keyword.replaceAll("([:\\*\\?\\[\\]\\.\\$\\{\\}])", "");
  }

  public int getTotalToOptimize() {
    return totalToOptimize;
  }

  public void setTotalToOptimize(int totalToOptimize) {
    this.totalToOptimize = totalToOptimize;
  }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -