📄 indexsearcher.java
字号:
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.searcher;
import java.io.IOException;
import java.io.File;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.log4j.Logger;
import net.nutch.util.*;
import net.nutch.indexer.*;
/**
* Implements {@link Searcher} and {@link HitDetailer} for either a single
* merged index, or for a set of individual segment indexes.
*/
public class IndexSearcher implements Searcher, HitDetailer {
/** Shared log4j logger; note that it is registered under the name "lock", not the class name. */
public static final Logger LOG = Logger.getLogger("lock");
/** Lucene searchers keyed by mode string; entries are stored by {@code init()} under the mode it is given. */
private Map<String, org.apache.lucene.search.IndexSearcher> searchers = new
HashMap<String, org.apache.lucene.search.IndexSearcher>();
// private String[] sites;
/**
 * Query optimizer, configured from the "searcher.filter.cache.size" (default 16) and
 * "searcher.filter.cache.threshold" (default 0.05) settings.
 */
private LuceneQueryOptimizer optimizer = new LuceneQueryOptimizer(NutchConf
.getInt("searcher.filter.cache.size", 16), NutchConf.getFloat(
"searcher.filter.cache.threshold", 0.05f));
//private SearcherLock searchLock = new SearcherLock(200);
/**
 * Construct given a number of indexed segments.
 *
 * <p>Each segment directory may hold a RAM-mode index (loaded through
 * {@link RAMDirectory}) and/or a regular on-disk index. All indexes of the same
 * kind are combined into a single {@link MultiReader}, so each mode is
 * initialised exactly once and its searcher covers every segment that provides
 * that kind of index.
 *
 * @param segmentDirs segment directories to open
 * @throws IOException if an index cannot be opened or initialised; readers that
 *         were already opened are closed (best effort) before the exception propagates
 */
public IndexSearcher(File[] segmentDirs) throws IOException {
    List<IndexReader> ramReaders = new ArrayList<IndexReader>();
    List<IndexReader> regReaders = new ArrayList<IndexReader>();
    boolean initialised = false;
    try {
        for (int i = 0; i < segmentDirs.length; i++) {
            // Memory-resident index of this segment, if present.
            File ramDir = new File(segmentDirs[i], IndexSegment.IDX_RAM_NAME);
            if (ramDir.exists() && ramDir.isDirectory()) {
                ramReaders.add(IndexReader.open(new RAMDirectory(ramDir)));
            }
            // Regular on-disk index of this segment, if present.
            File regDir = new File(segmentDirs[i], IndexSegment.IDX_REG_NAME);
            if (regDir.exists() && regDir.isDirectory()) {
                regReaders.add(IndexReader.open(regDir));
            }
        }
        // init() stores one searcher per mode, so it must be called once per mode
        // with all readers of that mode; calling it per segment would replace the
        // searcher registered for the previous segment.
        if (!ramReaders.isEmpty()) {
            IndexReader[] readers = ramReaders.toArray(new IndexReader[0]);
            System.out.println("RAM " + readers.length);
            init(new MultiReader(readers), DistributedSearch.MODE_RAM);
            System.out.println("init RAM searcher ok");
        }
        if (!regReaders.isEmpty()) {
            IndexReader[] readers = regReaders.toArray(new IndexReader[0]);
            System.out.println("REG " + readers.length);
            init(new MultiReader(readers), DistributedSearch.MODE_REG);
            System.out.println("init REG searcher ok");
        }
        initialised = true;
    } finally {
        if (!initialised) {
            // Construction failed: release whatever was opened so far.
            List<IndexReader> opened = new ArrayList<IndexReader>(ramReaders);
            opened.addAll(regReaders);
            for (IndexReader reader : opened) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // Best-effort cleanup; the original failure is the one reported.
                }
            }
        }
    }
}
/**
* Construct given a directory containing fetched segments, and a separate
* directory naming their merged index.
*/
public IndexSearcher(String index) throws IOException {
File file = new File(index);
if (!(file.exists() && file.isDirectory())) {
LOG.warn("Path "+index+" not exist.");
return;
}
if (file.getName().endsWith(IndexSegment.IDX_RAM_NAME)) {
init(IndexReader.open(index), DistributedSearch.MODE_RAM);
}
else if (file.getName().endsWith(IndexSegment.IDX_REG_NAME)) {
init(IndexReader.open(index), DistributedSearch.MODE_REG);
}
}
/**
 * Pre-loads the field caches for a reader and registers a searcher over it.
 *
 * @param reader Lucene index reader the new searcher will use
 * @param mode key under which the searcher is stored in {@code searchers};
 *        an existing entry for the same key is replaced
 * @throws IOException declared for the field-cache and searcher calls
 */
private void init(IndexReader reader, String mode) throws IOException {
    // Load the site-rank cache and the int caches for pubTime / gid / cid up front.
    FieldCache cache = FieldCache.DEFAULT;
    cache.getSiteRank(reader);
    String[] intFields = { "pubTime", "gid", "cid" };
    for (int i = 0; i < intFields.length; i++) {
        cache.getInts(reader, intFields[i]);
    }

    org.apache.lucene.search.IndexSearcher luceneSearcher =
            new org.apache.lucene.search.IndexSearcher(reader);
    luceneSearcher.setSimilarity(new NutchSimilarity());
    // Score threshold applied when sortType == 2; defaults to 0.1.
    float scoreLimit = NutchConf.getFloat("searcher.score.limit.value",0.1f);
    luceneSearcher.setScoreLimit(scoreLimit);

    searchers.put(mode, luceneSearcher);
}
/**
 * Deletes a document from the RAM and REG searchers, whichever are registered.
 *
 * <p>Searching is paused through {@code SearcherLock.stop()} and the method
 * waits (up to five one-second sleeps) until all search locks are released.
 * {@code SearcherLock.start()} is always called again before the method exits,
 * whatever the outcome.
 *
 * <p>NOTE(review): the same {@code docID} is passed to both searchers - confirm
 * that document numbers are meant to be shared between the two indexes.
 *
 * @param docID document number to delete
 * @throws IOException if the search locks are still held after the wait, or if
 *         a searcher fails to delete the document (the failure is attached as cause)
 */
public void delDoc(int docID) throws IOException {
    SearcherLock.stop();
    try {
        int times = 0;
        while (!SearcherLock.isAllUnlock()) {
            times++;
            if (times > 5) {
                throw new IOException("Del doc: After 5 seconds, Still have locked locks!");
            }
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                // Keep the interrupt visible to the caller; as before, give up
                // without deleting anything.
                Thread.currentThread().interrupt();
                return;
            }
        }
        try {
            // Delete doc in RAM mode
            org.apache.lucene.search.IndexSearcher searcher =
                    searchers.get(DistributedSearch.MODE_RAM);
            if (searcher != null) {
                searcher.delDoc(docID);
            }
            // Delete doc in REG mode
            searcher = searchers.get(DistributedSearch.MODE_REG);
            if (searcher != null) {
                searcher.delDoc(docID);
            }
        } catch (Exception e) {
            IOException wrapped = new IOException(e.getMessage());
            wrapped.initCause(e);
            throw wrapped;
        }
    } finally {
        // Resume searching on every exit path, including errors.
        SearcherLock.start();
    }
}
/**
 * Opens the index (or indexes) of a segment directory and adds them to the
 * matching searcher.
 *
 * <p>For mode MIX or RAM the segment's RAM index is loaded into memory and
 * added to the RAM searcher; for mode MIX or REG the regular index is opened
 * from disk and added to the REG searcher. An index directory that does not
 * exist is skipped.
 *
 * @param segmentDir segment directory containing the index sub-directories
 * @param mode one of the {@code DistributedSearch} mode constants
 * @return {@code false} if adding any opened reader failed, {@code true} otherwise
 * @throws IOException if an index reader cannot be opened
 */
public boolean addSegment(File segmentDir, String mode) throws IOException {
    boolean allAdded = true;
    if (mode.equals(DistributedSearch.MODE_MIX)
            || mode.equals(DistributedSearch.MODE_RAM)) {
        allAdded = openAndAddIndex(segmentDir, IndexSegment.IDX_RAM_NAME,
                DistributedSearch.MODE_RAM, true);
    }
    if (mode.equals(DistributedSearch.MODE_MIX)
            || mode.equals(DistributedSearch.MODE_REG)) {
        // Non-short-circuit update: the REG index is always attempted.
        allAdded &= openAndAddIndex(segmentDir, IndexSegment.IDX_REG_NAME,
                DistributedSearch.MODE_REG, false);
    }
    return allAdded;
}

/**
 * Opens one index sub-directory of a segment and hands the reader to the
 * searcher of the given mode.
 *
 * <p>NOTE(review): the reader is not closed when adding it fails - confirm
 * whether it should be.
 *
 * @return {@code false} only if adding the reader threw; a missing directory
 *         counts as success
 */
private boolean openAndAddIndex(File segmentDir, String indexName, String targetMode,
        boolean inMemory) throws IOException {
    File indexDir = new File(segmentDir, indexName);
    if (!indexDir.exists()) {
        return true;
    }
    // RAM-mode indexes are loaded fully into memory; regular ones are read from disk.
    IndexReader reader = inMemory
            ? IndexReader.open(new RAMDirectory(indexDir))
            : IndexReader.open(indexDir);
    LOG.info("Open index reader at "+indexDir.getName());
    try {
        addSegment(reader, targetMode);
        LOG.info("Add new segment ["+segmentDir+"] success.");
        return true;
    } catch (Exception ex) {
        LOG.error("Add new segment ["+segmentDir+"] failed : "+ex.getMessage());
        return false;
    }
}
/**
 * Adds a reader to the searcher registered for the given mode.
 *
 * <p>Searching is paused through {@code SearcherLock.stop()} and the method
 * waits (up to five one-second sleeps) until all search locks are released.
 * {@code SearcherLock.start()} is called again on every exit path, including
 * the successful one.
 *
 * @param reader reader to add; {@code null} is ignored
 * @param mode key of the searcher that receives the reader
 * @throws IOException if the search locks are still held after the wait, if no
 *         searcher is registered for {@code mode}, or if adding the reader fails
 *         (the failure is attached as cause)
 */
private void addSegment(IndexReader reader, String mode) throws IOException {
    SearcherLock.stop();
    try {
        int times = 0;
        while (!SearcherLock.isAllUnlock()) {
            times++;
            if (times > 5) {
                throw new IOException("Add segment:After 5 seconds, Still have locked locks!");
            }
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                // Keep the interrupt visible to the caller; as before, give up
                // without adding the reader.
                Thread.currentThread().interrupt();
                return;
            }
        }
        if (reader == null) {
            return;
        }
        org.apache.lucene.search.IndexSearcher searcher = searchers.get(mode);
        if (searcher == null) {
            throw new IOException("Add segment: no searcher registered for mode " + mode);
        }
        try {
            searcher.addReader(reader);
        } catch (Exception e) {
            IOException wrapped = new IOException(e.getMessage());
            wrapped.initCause(e);
            throw wrapped;
        }
    } finally {
        // Resume searching on every exit path; the success path must not leave
        // the lock stopped.
        SearcherLock.start();
    }
}
/**
 * Removes the readers of the named segments from the RAM and/or REG searcher.
 *
 * <p>Searching is paused through {@code SearcherLock.stop()} and the method
 * waits (up to five one-second sleeps) until all search locks are released.
 * {@code SearcherLock.start()} is always called again before the method exits.
 *
 * @param segments segment names passed to each searcher's {@code delReaders}
 * @param mode MIX removes from both searchers, RAM or REG from that one only
 * @return {@code false} if the wait was interrupted or any searcher failed to
 *         remove the readers, {@code true} otherwise
 * @throws IOException if the search locks are still held after the wait
 */
public boolean delSegments(String[] segments, String mode) throws IOException {
    SearcherLock.stop();
    try {
        int times = 0;
        while (!SearcherLock.isAllUnlock()) {
            times++;
            if (times > 5) {
                throw new IOException("Delete segments:After 5 seconds, Still have locked locks!");
            }
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                // Keep the interrupt visible to the caller and report failure.
                Thread.currentThread().interrupt();
                return false;
            }
        }
        boolean allDeleted = true;
        org.apache.lucene.search.IndexSearcher searcher;
        if (mode.equals(DistributedSearch.MODE_MIX) || mode.equals(DistributedSearch.MODE_RAM)) {
            searcher = searchers.get(DistributedSearch.MODE_RAM);
            if (searcher != null) {
                try {
                    searcher.delReaders(segments);
                } catch (Exception e) {
                    allDeleted = false;
                    LOG.error("delete RAM segments error: "+e.getMessage(), e);
                }
            }
        }
        if (mode.equals(DistributedSearch.MODE_MIX) || mode.equals(DistributedSearch.MODE_REG)) {
            searcher = searchers.get(DistributedSearch.MODE_REG);
            if (searcher != null) {
                try {
                    searcher.delReaders(segments);
                } catch (Exception e) {
                    allDeleted = false;
                    LOG.error("delete REG segments error: "+e.getMessage(), e);
                }
            }
        }
        return allDeleted;
    } finally {
        // Resume searching on every exit path, including a null mode or other
        // unexpected runtime failure.
        SearcherLock.start();
    }
}
/***************************************************************************
* public void mergeSegments(String[] segments, String newSegment) throws
* IOException { //searchLock.stop(); //while( !searchLock.isAllUnlock() );
* addSegment(new File(newSegment)); //luceneSearcher.delReaders(segments);
* //searchLock.start(); delSegments(segments); }
**************************************************************************/
/**
 * Returns {@code maxDoc()} of the searcher registered for a mode.
 *
 * @param mode key of the searcher to query
 * @return the searcher's {@code maxDoc()} value, or -1 if no searcher is
 *         registered for {@code mode}
 * @throws IOException declared for the underlying searcher call
 */
public long getMaxdocs(String mode) throws IOException {
    org.apache.lucene.search.IndexSearcher registered = searchers.get(mode);
    if (registered == null) {
        return -1;
    }
    return registered.maxDoc();
}
/**
 * Runs a query against the searcher of the given mode while holding a search lock.
 *
 * <p>A lock is requested from {@code SearcherLock}, retrying up to twenty times
 * at 100 ms intervals. Once obtained, the lock is released on every exit path.
 *
 * @param query query to run; it is converted with {@code QueryFilters.filter}
 * @param numHits passed through to the query optimizer
 * @param sortType passed through to the query optimizer and used when
 *        translating the results into hits
 * @param start passed through to the query optimizer
 * @param end passed through to the query optimizer
 * @param mode searcher mode; MIX is not supported here and is treated as RAM
 * @return the translated hits
 * @throws IOException if no lock could be obtained, the wait was interrupted,
 *         no searcher is registered for the mode, or the search itself fails
 *         (the failure is attached as cause)
 */
public Hits search(Query query, int numHits, int sortType, long start, long end, String mode) throws IOException {
    org.apache.lucene.search.BooleanQuery luceneQuery = QueryFilters
            .filter(query);
    int lock = SearcherLock.getLock();
    int times = 0;
    while (lock < 0) {
        times++;
        if (times > 20) {
            throw new IOException("Can't get a lock!");
        }
        try {
            Thread.sleep(100);
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the caller.
            Thread.currentThread().interrupt();
            throw new IOException(e.getMessage());
        }
        lock = SearcherLock.getLock();
    }
    TopDocs topDocs = null;
    try {
        // Everything that runs while the lock is held stays inside this try so
        // that the finally block can release it.
        LOG.info("search1: get lock :" + lock + " |query:" + query.getQueryStr());
        // Guard against MIX, which has no searcher of its own.
        String searcherMode = mode.equals(DistributedSearch.MODE_MIX)
                ? DistributedSearch.MODE_RAM : mode;
        org.apache.lucene.search.IndexSearcher searcher = searchers.get(searcherMode);
        if (searcher == null) {
            throw new IOException("No searcher registered for mode: " + searcherMode);
        }
        topDocs = optimizer.optimize(luceneQuery, searcher, numHits, sortType, start, end);
    } catch (Exception e) {
        LOG.error("search1: failed, mode:" + mode, e);
        IOException wrapped = new IOException(e.toString());
        wrapped.initCause(e);
        throw wrapped;
    } finally {
        SearcherLock.unLock(lock);
        LOG.info("search1: free lock :" + lock);
    }
    return translateHits(topDocs, sortType);
}
/**
 * Runs a query with both range arguments ({@code start} and {@code end}) set to 0.
 *
 * @param query query to run
 * @param numHits passed through to the full search
 * @param sortType passed through to the full search
 * @param mode searcher mode
 * @return the hits produced by the full search
 * @throws IOException if the full search fails
 */
public Hits search(Query query, int numHits, int sortType, String mode) throws IOException {
    final long start = 0;
    final long end = 0;
    Hits hits = search(query, numHits, sortType, start, end, mode);
    return hits;
}
/**
 * Runs a query with sort type 0.
 *
 * <p>Delegates to the overload that takes an explicit sort type; calling
 * {@code search(query, numHits, mode)} here would resolve to this same method
 * and recurse without end.
 *
 * <p>NOTE(review): 0 is assumed to be the intended default sort type
 * (presumably relevance ordering) - confirm against the callers.
 *
 * @param query query to run
 * @param numHits passed through to the full search
 * @param mode searcher mode
 * @return the hits produced by the full search
 * @throws IOException if the full search fails
 */
public Hits search(Query query, int numHits, String mode) throws IOException {
    return search(query, numHits, 0, mode);
}
/**
 * Returns the HTML explanation of how a hit scored against a query.
 *
 * <p>A lock is requested from {@code SearcherLock}, retrying up to twenty times
 * at 100 ms intervals. Once obtained, the lock is released on every exit path.
 *
 * @param query query the hit was found with
 * @param hit hit to explain; its mode selects the searcher
 * @return the explanation rendered with {@code Explanation.toHtml()}
 * @throws IOException if no lock could be obtained, the wait was interrupted,
 *         or the explanation cannot be produced (the failure is attached as cause)
 */
public String getExplanation(Query query, Hit hit) throws IOException {
    int lock = SearcherLock.getLock();
    int times = 0;
    while (lock < 0) {
        times++;
        if (times > 20) {
            throw new IOException("Can't get a lock!");
        }
        try {
            Thread.sleep(100);
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the caller.
            Thread.currentThread().interrupt();
            throw new IOException(e.getMessage());
        }
        lock = SearcherLock.getLock();
    }
    LOG.info("getExplanation: get lock :" + lock);
    Explanation explanation;
    try {
        // The searcher lookup is inside the try so that a failure here (for
        // example a null hit) still releases the lock.
        org.apache.lucene.search.IndexSearcher searcher = searchers.get(hit.getMode());
        explanation = searcher.explain(QueryFilters.filter(query),
                hit.getIndexDocNo());
    } catch (Exception e) {
        IOException wrapped = new IOException(e.toString());
        wrapped.initCause(e);
        throw wrapped;
    } finally {
        SearcherLock.unLock(lock);
        LOG.info("getExplanation: free lock :" + lock);
    }
    return explanation.toHtml();
}
/**
 * Loads the stored fields of a hit's document.
 *
 * <p>A lock is requested from {@code SearcherLock}, retrying up to twenty times
 * at 100 ms intervals. The lock is held only while the document is read and is
 * released on every exit path.
 *
 * @param hit hit whose document is loaded; its mode selects the searcher
 * @return the document's field names and string values, in the order the
 *         document enumerates them
 * @throws IOException if no lock could be obtained, the wait was interrupted,
 *         or the document cannot be read (the failure is attached as cause)
 */
public HitDetails getDetails(Hit hit) throws IOException {
    int lock = SearcherLock.getLock();
    int times = 0;
    while (lock < 0) {
        times++;
        if (times > 20) {
            throw new IOException("Can't get a lock!");
        }
        try {
            Thread.sleep(100);
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the caller.
            Thread.currentThread().interrupt();
            throw new IOException(e.getMessage());
        }
        lock = SearcherLock.getLock();
    }
    LOG.info("getDetails: get lock :" + lock);
    Document doc;
    try {
        // The searcher lookup is inside the try so that a failure here (for
        // example a null hit) still releases the lock.
        org.apache.lucene.search.IndexSearcher searcher = searchers.get(hit.getMode());
        doc = searcher.doc(hit.getIndexDocNo());
    } catch (Exception e) {
        LOG.info("getDetails : error: " + e.getMessage());
        IOException wrapped = new IOException(e.toString());
        wrapped.initCause(e);
        throw wrapped;
    } finally {
        SearcherLock.unLock(lock);
        LOG.info("getDetails: free lock :" + lock);
    }
    List<String> fields = new ArrayList<String>();
    List<String> values = new ArrayList<String>();
    Enumeration<?> e = doc.fields();
    while (e.hasMoreElements()) {
        Field field = (Field) e.nextElement();
        fields.add(field.name());
        values.add(field.stringValue());
    }
    return new HitDetails(fields.toArray(new String[fields.size()]),
            values.toArray(new String[values.size()]));
}
/**
 * Loads the details of several hits, one after another.
 *
 * @param hits hits to look up
 * @return an array of the same length as {@code hits}, where element {@code i}
 *         holds the details of {@code hits[i]}
 * @throws IOException if loading the details of any hit fails
 */
public HitDetails[] getDetails(Hit[] hits) throws IOException {
    HitDetails[] details = new HitDetails[hits.length];
    int next = 0;
    for (Hit hit : hits) {
        details[next++] = getDetails(hit);
    }
    return details;
}
/**
 * Converts Lucene search results into {@link Hits}.
 *
 * <p>For sort types 1 and 2 each result is expected to be a {@link FieldDoc}
 * whose first sort field is an {@code Integer}; that value is passed to the
 * hit. Every other sort type produces a plain hit, so an unrecognised sort
 * type no longer leaves a {@code null} entry that fails on first use. For sort
 * type 5 the group document count and site-rank sum are copied as well.
 *
 * @param topDocs results returned by the query optimizer
 * @param sortType sort type the search was run with
 * @return the total hit count together with one {@code Hit} per result
 * @throws IOException declared for callers; not thrown by the visible code
 */
private Hits translateHits(TopDocs topDocs, int sortType)
        throws IOException {
    ScoreDoc[] scoreDocs = topDocs.scoreDocs;
    int length = scoreDocs.length;
    Hit[] hits = new Hit[length];
    // Sort types 1 and 2 carry an int sort value in the first FieldDoc field.
    boolean hasSortValue = sortType == 1 || sortType == 2;
    for (int i = 0; i < length; i++) {
        ScoreDoc scoreDoc = scoreDocs[i];
        Hit hit;
        if (hasSortValue) {
            int sortValue = ((Integer) ((FieldDoc) scoreDoc).fields[0]).intValue();
            hit = new Hit(scoreDoc.doc, scoreDoc.score, scoreDoc.grpNo, 1, sortValue);
        } else {
            // Relevance sort, no sort, optimised sort, and any unrecognised type.
            hit = new Hit(scoreDoc.doc, scoreDoc.score, scoreDoc.grpNo, 1);
        }
        if (sortType == 5) {
            hit.setGrpDocs(scoreDoc.grpDocs);
            hit.setSiteRankSum(scoreDoc.siteRankSum);
        }
        hit.setClustNo(scoreDoc.clustNo);
        hit.setSiteRank(scoreDoc.siteRank);
        hits[i] = hit;
    }
    return new Hits(topDocs.totalHits, hits);
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -