📄 searchbean.java
字号:
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.searcher;
import java.io.*;
import java.util.*;
import org.apache.log4j.*;
import javax.servlet.ServletContext;
import net.nutch.fs.*;
import net.nutch.util.*;
import net.nutch.parse.*;
import net.nutch.indexer.*;
import net.nutch.searcher.Query.Clause;
/**
* One stop shopping for search-related functionality.
*
* @version $Id: SearchBean.java,v 1.21 2006/12/01 10:15:15 zhu_yy Exp $
*/
public class SearchBean implements Searcher, HitDetailer, HitSummarizer, HitContent {
/** Shared logger, registered under the name "search". */
public static final Logger LOG = Logger.getLogger("search");
static {
// Enable thread IDs in the log formatter as soon as the class is loaded.
LogFormatter.setShowThreadIDs(true);
}
// private String[] segmentNames;
/** Names of the segments known to this bean; elements are read back as String. */
private ArrayList segmentNames = new ArrayList();
/**
 * Directories that are searched for segments. In the visible code this is
 * only assigned by the String and String[] constructors, so it is null for
 * beans built through the other constructors.
 */
private ArrayList segmentPaths;
/** Delegate used for searching (the local index searcher or the distributed client). */
private Searcher searcher;
/** Delegate used for hit details (the local index searcher or the distributed client). */
private HitDetailer detailer;
/** Delegate used for summaries (the local fetched segments or the distributed client). */
private HitSummarizer summarizer;
/** Delegate used for content (the local fetched segments or the distributed client). */
private HitContent content;
/** Local index searcher; assigned only on the local (non-distributed) initialization paths. */
private IndexSearcher indexSearcher;
/** Local fetched segments; assigned only on the local (non-distributed) initialization paths. */
private FetchedSegments segments;
/** Distributed search client; assigned only when a search-servers.txt file is found. */
private DistributedSearch.Client client;
/** Read from the "searcher.grouping.rawhits.factor" setting, defaulting to 10.0. */
private float RAW_HITS_FACTOR = NutchConf.getFloat(
"searcher.grouping.rawhits.factor", 10.0f);
// NOTE(review): the three values below are not referenced in the visible
// part of the file; their exact role should be confirmed against the rest
// of the class.
private int HITS_PER_PAGE = 10;
private int PAGES_PER_SEARCH = 10;
private static int MAX_RESULT_HITS = 3000;
/**
* BooleanQuery won't permit more than 32 required/prohibited clauses. We
* don't want to use too many of those.
*/
private static final int MAX_PROHIBITED_TERMS = 20;
/**
 * Returns the bean cached in the servlet context under the "searchBean"
 * attribute. When no bean is cached yet, one is created with the no-arg
 * constructor, stored in the context and returned.
 *
 * @param app servlet context used as the cache
 * @return the cached bean, or the newly created one
 * @throws IOException if creating a new bean fails
 */
public static SearchBean get(ServletContext app) throws IOException {
    SearchBean cached = (SearchBean) app.getAttribute("searchBean");
    if (cached != null) {
        return cached;
    }
    LOG.info("creating new bean");
    SearchBean created = new SearchBean();
    app.setAttribute("searchBean", created);
    return created;
}
/**
 * Construct reading from the directory named by the "searcher.dir"
 * setting, falling back to the current directory (".") when it is not set.
 *
 * @throws IOException if the delegated File-based constructor fails
 */
public SearchBean() throws IOException {
this(new File(NutchConf.get("searcher.dir", ".")));
}
/**
 * Construct in a named directory.
 *
 * When the directory contains a "search-servers.txt" file, a distributed
 * search client is created from it and used for every operation.
 * Otherwise the bean is initialized locally from the "index" and
 * "segments" sub-directories.
 *
 * @param dir directory to read from
 * @throws IOException if initialization fails
 */
public SearchBean(File dir) throws IOException {
    File serverList = new File(dir, "search-servers.txt");
    if (!serverList.exists()) {
        // No server list: search the local index and segments.
        init(new File(dir, "index"), new File(dir, "segments"));
        return;
    }
    LOG.info("searching servers in " + serverList.getCanonicalPath());
    client = new DistributedSearch.Client(serverList);
    init(client);
}
/**
 * Construct over several directories that each contain segments.
 *
 * Every child of every directory that holds an
 * {@code IndexSegment.IDX_DONE_NAME} marker file is opened by the index
 * searcher. The directories are also handed to {@code FetchedSegments},
 * which supplies the segment names.
 *
 * @param dirs directories containing segments; NOTE: the array is sorted
 *             in place, as it always has been
 * @throws IOException if one of the directories cannot be listed (missing,
 *                     not a directory, or unreadable) or if opening the
 *                     index or the segments fails
 */
public SearchBean(String[] dirs) throws IOException {
    Arrays.sort(dirs);
    segmentPaths = new ArrayList(Arrays.asList(dirs));

    ArrayList indexedSegments = new ArrayList();
    for (int i = 0; i < dirs.length; i++) {
        File segmentsDir = new File(dirs[i]);
        String[] children = segmentsDir.list();
        if (children == null) {
            // File.list() returns null rather than throwing; fail with a
            // meaningful error instead of a NullPointerException below.
            throw new IOException("Cannot list segments directory: "
                    + segmentsDir.getAbsolutePath());
        }
        Arrays.sort(children);
        for (int j = 0; j < children.length; j++) {
            File segmentFile = new File(segmentsDir, children[j]);
            File indexDone = new File(segmentFile, IndexSegment.IDX_DONE_NAME);
            // isFile() is false for a missing file, so no exists() check is needed.
            if (indexDone.isFile()) {
                indexedSegments.add(segmentFile);
            }
        }
    }
    File[] directories = (File[]) indexedSegments
            .toArray(new File[indexedSegments.size()]);

    indexSearcher = new IndexSearcher(directories);
    segments = new FetchedSegments(new LocalFileSystem(), dirs);
    String[] names = segments.getSegmentNames();
    if (names != null) {
        this.segmentNames.addAll(Arrays.asList(names));
    }

    this.searcher = indexSearcher;
    this.detailer = indexSearcher;
    this.summarizer = segments;
    this.content = segments;
}
/**
 * Opens a local index.
 *
 * The index is looked up under {@code IndexSegment.IDX_RAM_NAME} inside
 * the given directory first; when that does not exist,
 * {@code IndexSegment.IDX_REG_NAME} is used instead. The directory itself
 * serves as the segments directory and becomes the only entry of the
 * segment search paths.
 *
 * @param dir directory holding the index and the segments
 * @throws IOException if initialization fails
 */
public SearchBean(String dir) throws IOException {
    SearcherLock.init();

    segmentPaths = new ArrayList();
    segmentPaths.add(dir);

    File segmentRoot = new File(dir);
    File ramIndex = new File(segmentRoot, IndexSegment.IDX_RAM_NAME);
    File indexDir = ramIndex.exists()
            ? ramIndex
            : new File(segmentRoot, IndexSegment.IDX_REG_NAME);

    init(indexDir, segmentRoot);
}
/**
 * Initializes the bean for local searching.
 *
 * When {@code indexDir} exists it is opened as a single merged index.
 * Otherwise every child of {@code segmentsDir} that holds an
 * {@code IndexSegment.IDX_DONE_NAME} marker file is opened as a segment
 * index. In both cases the segment names come from a
 * {@code FetchedSegments} opened on {@code segmentsDir}.
 *
 * @param indexDir    location of the merged index, if any
 * @param segmentsDir directory containing the segments
 * @throws IOException if {@code segmentsDir} has to be listed but cannot
 *                     be (missing, not a directory, or unreadable), or if
 *                     opening the index or the segments fails
 */
private void init(File indexDir, File segmentsDir) throws IOException {
    if (indexDir.exists()) {
        LOG.info("opening merged index in " + indexDir.getCanonicalPath());
        indexSearcher = new IndexSearcher(indexDir.getCanonicalPath());
    } else {
        LOG.info("opening segment indexes in " + segmentsDir.getCanonicalPath());
        String[] children = segmentsDir.list();
        if (children == null) {
            // File.list() returns null rather than throwing; fail with a
            // meaningful error instead of a NullPointerException below.
            throw new IOException("Cannot list segments directory: "
                    + segmentsDir.getCanonicalPath());
        }
        Arrays.sort(children);
        ArrayList indexedSegments = new ArrayList();
        for (int i = 0; i < children.length; i++) {
            File segmentFile = new File(segmentsDir, children[i]);
            File indexDone = new File(segmentFile, IndexSegment.IDX_DONE_NAME);
            // isFile() is false for a missing file, so no exists() check is needed.
            if (indexDone.isFile()) {
                indexedSegments.add(segmentFile);
            }
        }
        File[] directories = (File[]) indexedSegments
                .toArray(new File[indexedSegments.size()]);
        indexSearcher = new IndexSearcher(directories);
    }

    segments = new FetchedSegments(new LocalFileSystem(), segmentsDir.toString());
    String[] names = segments.getSegmentNames();
    if (names != null) {
        this.segmentNames.addAll(Arrays.asList(names));
    }

    this.searcher = indexSearcher;
    this.detailer = indexSearcher;
    this.summarizer = segments;
    this.content = segments;
}
/**
 * Initializes the bean for distributed searching: the client becomes the
 * delegate for searching, details, summaries and content.
 *
 * Segment names are requested in RAM mode when the client reports at
 * least one RAM-mode address, and in regular mode otherwise.
 *
 * @param client distributed search client to delegate to
 * @throws IOException declared for callers; presumably raised by the
 *                     client calls (not visible here)
 */
private void init(DistributedSearch.Client client) throws IOException {
    // Check whether RAM mode is in use.
    boolean ramMode = client.getAddressCount(DistributedSearch.MODE_RAM) > 0;
    String[] names = ramMode
            ? client.getSegmentNames(DistributedSearch.MODE_RAM)
            : client.getSegmentNames(DistributedSearch.MODE_REG);
    if (names != null) {
        this.segmentNames.addAll(Arrays.asList(names));
    }

    this.searcher = client;
    this.detailer = client;
    this.summarizer = client;
    this.content = client;
}
/**
 * Forwards a server status change to the distributed search client.
 *
 * NOTE(review): the client field is only assigned when the bean was built
 * from a directory containing search-servers.txt (in the visible code), so
 * this presumably throws NullPointerException on a local bean - confirm.
 *
 * @param hostip address of the server
 * @param port   port of the server
 * @param stat   status flag passed through to the client unchanged
 */
public void setServerStat(String hostip, int port, boolean stat) {
client.setServerStat(hostip, port, stat);
}
/**
 * Deletes a document from the local index searcher.
 *
 * Any exception raised by the deletion is logged at error level and
 * reported through the return value instead of being propagated.
 *
 * @param docID id of the document to delete
 * @return true when the deletion completed without an exception, false otherwise
 */
public boolean delDoc(int docID) {
    try {
        indexSearcher.delDoc(docID);
        return true;
    } catch (Exception failure) {
        LOG.error("Delete document :" + docID + " Error:" + failure.toString());
        return false;
    }
}
/**
 * Deletes the document behind a hit by delegating to the distributed
 * search client.
 *
 * @param hit hit identifying the document
 * @return whatever the client reports
 * @throws IOException if the client call fails
 */
public boolean delDoc(Hit hit) throws IOException {
return client.delDoc(hit);
}
/**
 * Asks the distributed search client for the host associated with a hit.
 *
 * @param hit hit to look up
 * @return the host string returned by the client
 */
public String getHostByHit(Hit hit) {
return client.getHostByHit(hit);
}
/**
 * Adds a segment to the local index searcher and to the fetched segments.
 *
 * {@code segmentName} is either a bare segment name, which is looked up in
 * every known segment path, or a name containing {@code File.separator},
 * which is used as a path directly. In the second case the segment's
 * parent directory is remembered as a segment path for later lookups.
 *
 * @param segmentName bare segment name, or a path to the segment
 * @param mode        mode passed through to the index searcher
 * @return true if the segment was added or its name was already known;
 *         false if it could not be found on disk, the index searcher
 *         refused it, or an exception occurred (which is logged)
 */
public synchronized boolean addSegment(String segmentName, String mode) {
    // Beans built through SearchBean() / SearchBean(File) never assign
    // segmentPaths; start from an empty list instead of failing with a
    // NullPointerException.
    if (segmentPaths == null) {
        segmentPaths = new ArrayList();
    }

    File newSegment = null;
    String newSegmentName;
    if (segmentName.indexOf(File.separator) < 0) { // is a segment name
        // Check whether the segment exists under one of the known paths;
        // if it does not, return false. If the file system failed to copy
        // the segment properly, this also returns false, which leaves a
        // gap in the overall sequence of added segments.
        boolean found = false;
        for (int i = 0; i < segmentPaths.size() && !found; i++) {
            newSegment = new File((String) segmentPaths.get(i), segmentName);
            found = newSegment.exists();
        }
        if (!found) {
            LOG.info("new segment: "+ newSegment + " | segmentPaths "+segmentPaths.size());
            return false;
        }
        newSegmentName = segmentName;
    } else {
        LOG.info(">>> addSegment - segment name with path : "+segmentName);
        newSegment = new File(segmentName);
        if (!newSegment.exists()) {
            return false;
        }
        newSegmentName = newSegment.getName();
        // Register each parent directory once; re-adding it on every call
        // only grew the list and repeated the same lookups.
        String parent = newSegment.getParent();
        if (!segmentPaths.contains(parent)) {
            segmentPaths.add(parent);
        }
    }

    // Nothing to do when the segment is already registered.
    if (segmentNames.contains(newSegmentName)) {
        return true;
    }

    try {
        boolean added = indexSearcher.addSegment(newSegment, mode);
        if (added) {
            this.segments.addSegment(new LocalFileSystem(), newSegment
                    .getAbsolutePath());
            this.segmentNames.add(newSegmentName);
        }
        return added;
    } catch (Exception e) {
        // Log the failure, with its stack trace, through the logger
        // rather than at info level plus System.out.
        LOG.error("*********Add segment :" + newSegment.getAbsolutePath()
                + " error!", e);
        return false;
    }
}
/**
 * Removes the given segments from the local fetched segments only; the
 * index searcher and the list of segment names are not touched here.
 *
 * @param segments names of the segments to remove
 * @throws IOException if the fetched-segments call fails
 */
public void closeSegments(String[] segments) throws IOException {
this.segments.delSegments(segments);
}
/**
 * Removes segments from the local index searcher and, when that succeeds,
 * from the fetched segments and from the list of known segment names.
 * Refactored from <code>delReaders</code> -> <code>delSegments</code>.
 *
 * Note carried over from the original comment: after the short buckets
 * were merged onto the same server and port as the long buckets, add and
 * delete operations on segments are synchronized, and the mode argument is
 * then expected to always be ALL.
 *
 * @param segments names of the segments to remove
 * @param mode     mode passed through to the index searcher
 * @return the index searcher's result, or false when an exception occurred
 *         (which is logged)
 * @throws IOException declared for compatibility; exceptions raised while
 *                     deleting are caught and reported as false
 */
public synchronized boolean delSegments(String[] segments, String mode)
        throws IOException {
    try {
        boolean removed = indexSearcher.delSegments(segments, mode);
        if (removed) {
            this.segments.delSegments(segments);
            for (int i = 0; i < segments.length; i++) {
                // Drops the first matching entry, if any.
                this.segmentNames.remove(segments[i]);
            }
        }
        return removed;
    } catch (Exception e) {
        // Concatenating the array itself would only print its identity
        // ("[Ljava.lang.String;@..."), so render the names explicitly.
        String names = (segments == null)
                ? "null"
                : Arrays.asList(segments).toString();
        LOG.warn("**********Del segment :" + names + " error", e);
        return false;
    }
}
/**
 * Replaces a set of segments with a merged one: the old segments are
 * removed first, then the new segment is added.
 *
 * The two steps are synchronized individually, not as a whole. When the
 * removal succeeds but the addition fails, the old segments stay removed.
 *
 * @param segments names of the segments that were merged
 * @param newSeg   name (or path) of the segment produced by the merge
 * @param mode     mode passed through to both steps
 * @return true when both steps succeeded, false otherwise
 * @throws IOException declared for compatibility; exceptions raised during
 *                     the merge are caught and reported as false
 */
public boolean mergeSegments(String[] segments, String newSeg, String mode)
        throws IOException {
    try {
        if (!delSegments(segments, mode)) {
            LOG.error(">>> Merge Segments delReaders Error");
            return false;
        }
        if (!addSegment(newSeg, mode)) {
            LOG.error(">>> Merge Segments addSegment Error");
            return false;
        }
    } catch (Exception e) {
        LOG.error("Merge Segments Error!" + e.toString(), e);
        return false;
    }
    // The space before "success!" was missing, which glued it to the name.
    LOG.info("Merge segments to " + newSeg + " success!");
    return true;
}
/**
 * Returns the value of the local index searcher's getMaxdocs for the given
 * mode. The call is synchronized on this bean.
 *
 * @param mode mode passed through to the index searcher
 * @return the value reported by the index searcher
 * @throws IOException if the index searcher call fails
 */
public synchronized long getMaxDocs(String mode) throws IOException {
return indexSearcher.getMaxdocs(mode);
}
/**
 * Deletes a document on a specific search server by delegating to the
 * distributed search client.
 *
 * @param docID id of the document to delete
 * @param host  host of the search server
 * @param port  port of the search server, as a string
 * @return whatever the client reports
 * @throws IOException if the client call fails
 */
public boolean delDoc(int docID, String host, String port)
throws IOException {
return client.delDoc(docID, host, port);
}
/**
 * Deletes a document on a specific search server, using the hard-coded
 * port "7000".
 *
 * @param docID id of the document to delete
 * @param host  host of the search server
 * @return whatever the client reports
 * @throws IOException if the client call fails
 */
public boolean delDoc(int docID, String host) throws IOException {
return client.delDoc(docID, host, "7000");
}
/**
 * Returns the value of the distributed search client's getMaxDocs for the
 * given server.
 *
 * @param host host of the search server
 * @param port port of the search server
 * @return the value reported by the client
 * @throws IOException if the client call fails
 */
public long getMaxDocs(String host, int port) throws IOException {
return client.getMaxDocs(host, port);
}
public boolean addSegment(String segmentName, String host, String port)
throws IOException {
for (int i = 0; i < segmentNames.size(); i++) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -