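// fetchedsegments.java
// "Font size:" -- label that appears to be left over from the web code viewer this listing was copied from.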
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.searcher;
import java.io.IOException;
import java.io.File;
import java.util.HashMap;
import org.apache.log4j.Logger;
import net.nutch.io.*;
import net.nutch.fs.*;
import net.nutch.fetcher.*;
import net.nutch.protocol.*;
import net.nutch.parse.*;
import net.nutch.indexer.*;
import kit.nlp.util.summary.*;
import kit.nlp.util.*;
import net.nutch.tools.StringUtils;
import com.jivesoftware.util.Cache;
import com.jivesoftware.util.CacheableObject;
/** Implements {@link HitSummarizer} and {@link HitContent} for a set of
* fetched segments. */
public class FetchedSegments implements HitSummarizer, HitContent {
/** Cache consulted and filled by getSummaryNew, keyed by the hit's "docid" value. */
private final Cache cache = new Cache();
/** Minimum size limit the constructors apply to the cache (presumably bytes -- TODO confirm). */
private int maxSize = 100 * 1024 * 1024;
/**
 * Read access to the per-document data stored in one segment directory.
 *
 * <p>Each of the four {@link ArrayFile.Reader}s is opened on first use and then
 * kept until {@link #close()} is called. Opening is guarded by this object's
 * monitor for all four readers, so concurrent first calls cannot each open a
 * reader and leave one of them unclosed.
 */
private static class Segment {
  private final NutchFileSystem nfs;
  private final File segmentDir;
  private ArrayFile.Reader fetcher;
  private ArrayFile.Reader content;
  private ArrayFile.Reader parsetext;
  private ArrayFile.Reader parsedata;

  /**
   * Remembers the file system and directory; no file is opened here.
   *
   * @param nfs file system used to open the readers
   * @param segmentDir directory of this segment
   */
  public Segment(NutchFileSystem nfs, File segmentDir) throws IOException {
    this.nfs = nfs;
    this.segmentDir = segmentDir;
  }

  /** Returns the directory this segment reads from. */
  public File getSegmentDir() {
    return segmentDir;
  }

  /**
   * Reads the fetcher output stored for a document.
   *
   * @param docNo document number passed to the reader
   * @return a freshly allocated entry filled by the reader
   * @throws IOException if the reader cannot be opened or read
   */
  public FetcherOutput getFetcherOutput(int docNo) throws IOException {
    ArrayFile.Reader reader;
    synchronized (this) {
      if (fetcher == null) {
        fetcher = open(FetcherOutput.DIR_NAME);
      }
      reader = fetcher;
    }
    FetcherOutput entry = new FetcherOutput();
    reader.get(docNo, entry);
    return entry;
  }

  /**
   * Reads the raw content bytes stored for a document.
   *
   * @param docNo document number passed to the reader
   * @return the bytes reported by the stored {@code Content} entry
   * @throws IOException if the reader cannot be opened or read
   */
  public byte[] getContent(int docNo) throws IOException {
    ArrayFile.Reader reader;
    synchronized (this) {
      if (content == null) {
        content = open(Content.DIR_NAME);
      }
      reader = content;
    }
    Content entry = new Content();
    reader.get(docNo, entry);
    return entry.getContent();
  }

  /**
   * Reads the parse data stored for a document.
   *
   * @param docNo document number passed to the reader
   * @return a freshly allocated entry filled by the reader
   * @throws IOException if the reader cannot be opened or read
   */
  public ParseData getParseData(int docNo) throws IOException {
    ArrayFile.Reader reader;
    synchronized (this) {
      if (parsedata == null) {
        parsedata = open(ParseData.DIR_NAME);
      }
      reader = parsedata;
    }
    ParseData entry = new ParseData();
    reader.get(docNo, entry);
    return entry;
  }

  /**
   * Reads the parse text stored for a document.
   *
   * @param docNo document number passed to the reader
   * @return a freshly allocated entry filled by the reader
   * @throws IOException if the reader cannot be opened or read
   */
  public ParseText getParseText(int docNo) throws IOException {
    ArrayFile.Reader reader;
    synchronized (this) {
      if (parsetext == null) {
        parsetext = open(ParseText.DIR_NAME);
      }
      reader = parsetext;
    }
    ParseText entry = new ParseText();
    reader.get(docNo, entry);
    return entry;
  }

  /**
   * Closes every reader that has been opened so far.
   *
   * <p>Added by Xie Shuqiang, 2005-08-17. All readers are attempted even when
   * an earlier one fails to close; if several fail, the exception from the
   * last failing reader is the one that propagates.
   *
   * @throws IOException if a reader fails to close
   */
  public synchronized void close() throws IOException {
    try {
      if (fetcher != null) {
        fetcher.close();
      }
    } finally {
      try {
        if (content != null) {
          content.close();
        }
      } finally {
        try {
          if (parsetext != null) {
            parsetext.close();
          }
        } finally {
          if (parsedata != null) {
            parsedata.close();
          }
        }
      }
    }
  }

  /** Opens a reader on the named sub-directory of this segment. */
  private ArrayFile.Reader open(String dirName) throws IOException {
    return new ArrayFile.Reader(nfs, new File(segmentDir, dirName).toString());
  }
}
/** Logger obtained under the name "lock"; the summary methods log lock acquisition and release to it. */
public static final Logger LOG = Logger.getLogger("lock");
/** Registered segments, keyed by the name of their segment directory. */
private final HashMap<String, Segment> segments = new HashMap<String, Segment>();
/**
 * Constructs from one directory containing fetcher output.
 *
 * <p>Equivalent to the array constructor called with a one-element array.
 *
 * @param nfs file system used to list and read the segments
 * @param segmentsDir parent directory whose children are segment directories
 *     (per the original note, the directory at the "current"/"today" level)
 * @throws IOException if listing or probing the directory fails
 */
public FetchedSegments(NutchFileSystem nfs, String segmentsDir) throws IOException {
  this(nfs, new String[] { segmentsDir });
}
/**
 * Constructs from several directories containing fetcher output.
 *
 * <p>Every child of every listed directory is registered as a segment when it
 * contains a regular file named {@code IndexSegment.IDX_DONE_NAME}. A child
 * name seen more than once keeps the last one registered. Afterwards the cache
 * limit is raised to {@code maxSize} if it is currently lower.
 *
 * @param nfs file system used to list and read the segments
 * @param segmentsDirs parent directories whose children are segment directories
 * @throws IOException if listing or probing a directory fails
 */
public FetchedSegments(NutchFileSystem nfs, String[] segmentsDirs) throws IOException {
  for (String parent : segmentsDirs) {
    File[] candidates = nfs.listFiles(new File(parent));
    if (candidates == null) {
      continue;
    }
    for (File candidate : candidates) {
      File marker = new File(candidate, IndexSegment.IDX_DONE_NAME);
      boolean indexed = nfs.exists(marker) && nfs.isFile(marker);
      if (!indexed) {
        continue;
      }
      segments.put(candidate.getName(), new Segment(nfs, candidate));
    }
  }
  if (maxSize > cache.getMaxSize()) {
    cache.setMaxSize(maxSize);
  }
}
/**
 * Registers one segment directory, replacing any segment of the same name.
 *
 * <p>The segment is only registered when its directory contains a regular file
 * named {@code IndexSegment.IDX_DONE_NAME}. A null or empty name is ignored,
 * matching {@code delSegment}. When an already registered segment is replaced,
 * the old one is closed so its open readers are not leaked.
 *
 * @param nfs file system to read the segment with; a new
 *     {@code LocalFileSystem} is used when this is null
 * @param segmentName path of the segment directory; its last path element is
 *     the key the segment is registered under
 * @throws IOException if probing the directory or closing a replaced segment fails
 */
public void addSegment(NutchFileSystem nfs, String segmentName) throws IOException {
  if (segmentName == null || segmentName.length() == 0) {
    return;
  }
  NutchFileSystem fs = nfs;
  if (fs == null) {
    fs = new LocalFileSystem();
  }
  File segmentDir = new File(segmentName);
  File indexdone = new File(segmentDir, IndexSegment.IDX_DONE_NAME);
  if (fs.exists(indexdone) && fs.isFile(indexdone)) {
    Segment replaced = segments.put(segmentDir.getName(), new Segment(fs, segmentDir));
    if (replaced != null) {
      // The replaced segment is no longer reachable through the map; release its readers.
      replaced.close();
    }
  }
}
/**
 * Unregisters one segment and closes its readers.
 *
 * <p>Added by Xie Shuqiang, 2005-08-17. Does nothing when the name is null or
 * empty, or when no segment is registered under it.
 *
 * @param segmentName path of the segment directory; only its last path element
 *     is used as the lookup key
 * @throws IOException if closing the segment fails
 */
public void delSegment(String segmentName) throws IOException {
  boolean hasName = segmentName != null && segmentName.length() != 0;
  if (!hasName) {
    return;
  }
  String key = new File(segmentName).getName();
  Segment removed = segments.remove(key);
  if (removed == null) {
    return;
  }
  removed.close();
}
/**
 * Unregisters several segments, in array order.
 *
 * <p>Added by Xie Shuqiang, 2005-08-17. A null array is ignored. The first
 * failure stops the loop and propagates.
 *
 * @param segmentName paths of the segment directories to remove
 * @throws IOException if closing one of the segments fails
 */
public void delSegments(String segmentName[]) throws IOException {
  if (segmentName != null) {
    for (String name : segmentName) {
      delSegment(name);
    }
  }
}
/**
 * Returns the names of all registered segments.
 *
 * @return a new array holding the keys of the segment map
 */
public String[] getSegmentNames() {
  String[] names = new String[segments.size()];
  return segments.keySet().toArray(names);
}
/**
 * Returns the raw content bytes of a hit.
 *
 * @param details hit whose "segment" and "docNo" values locate the document
 * @return the content bytes read from the hit's segment
 * @throws IOException if the segment data cannot be read
 */
public byte[] getContent(HitDetails details) throws IOException {
  Segment segment = getSegment(details);
  int docNo = getDocNo(details);
  return segment.getContent(docNo);
}
/**
 * Returns the parse data of a hit.
 *
 * @param details hit whose "segment" and "docNo" values locate the document
 * @return the parse data read from the hit's segment
 * @throws IOException if the segment data cannot be read
 */
public ParseData getParseData(HitDetails details) throws IOException {
  Segment segment = getSegment(details);
  int docNo = getDocNo(details);
  return segment.getParseData(docNo);
}
/**
 * Returns the anchors recorded in the fetch-list entry of a hit.
 *
 * @param details hit whose "segment" and "docNo" values locate the document
 * @return the anchors reported by the hit's fetch-list entry
 * @throws IOException if the segment data cannot be read
 */
public String[] getAnchors(HitDetails details) throws IOException {
  Segment segment = getSegment(details);
  int docNo = getDocNo(details);
  FetcherOutput output = segment.getFetcherOutput(docNo);
  return output.getFetchListEntry().getAnchors();
}
/**
 * Returns the fetch date recorded in the fetcher output of a hit.
 *
 * @param details hit whose "segment" and "docNo" values locate the document
 * @return the fetch date reported by the hit's fetcher output
 * @throws IOException if the segment data cannot be read
 */
public long getFetchDate(HitDetails details) throws IOException {
  Segment segment = getSegment(details);
  int docNo = getDocNo(details);
  FetcherOutput output = segment.getFetcherOutput(docNo);
  return output.getFetchDate();
}
/**
 * Returns the parse text of a hit.
 *
 * @param details hit whose "segment" and "docNo" values locate the document
 * @return the parse text read from the hit's segment
 * @throws IOException if the segment data cannot be read
 */
public ParseText getParseText(HitDetails details) throws IOException {
  Segment segment = getSegment(details);
  int docNo = getDocNo(details);
  return segment.getParseText(docNo);
}
/**
 * Returns the summary of a single hit (variant selected by a boolean flag).
 *
 * <p>The hit's parse text is read while a {@code SearcherLock} slot is held.
 * The slot is released in a {@code finally} block, so it is returned on every
 * exit path, including errors. The summary itself is built after the release.
 *
 * @param details hit whose "segment" and "docNo" values locate the document;
 *     its "title" value is also read for a full-text summary
 * @param query query handed to the keyword summarizer
 * @param fullTextSummary {@code false} to summarize with {@code Summarizer},
 *     {@code true} to summarize with {@code FullTextSummary}
 * @return the summary text
 * @throws IOException if no lock slot could be obtained, the wait was
 *     interrupted, the parse text could not be read, or the full-text
 *     summarizer failed
 */
public String getSummary(HitDetails details, Query query, boolean fullTextSummary)
    throws IOException {
  int lock = acquireSearcherLock();
  LOG.info("getSummary(old): get lock :" + lock);
  String text;
  try {
    Segment segment = getSegment(details);
    int docNo = getDocNo(details);
    text = segment.getParseText(docNo).getText();
  } catch (IOException e) {
    throw e;
  } catch (Exception e) {
    throw wrapAsIOException(e.getMessage(), e);
  } finally {
    SearcherLock.unLock(lock);
    LOG.info("getSummary(old): free lock :" + lock);
  }
  if (!fullTextSummary) {
    return new Summarizer().getSummary(text, query).toString();
  }
  return buildFullTextSummary(details, text);
}

/**
 * Returns the summary of a single hit (variant selected by a summary type),
 * reading the parse text through the cache.
 *
 * <p>The text is looked up in the cache under the hit's "docid" value. On a
 * miss (no entry, or an entry of size 0) it is read from the segment and, when
 * the docid is positive, stored in the cache. Lookup and read both happen
 * while a {@code SearcherLock} slot is held; the slot is released in a
 * {@code finally} block.
 *
 * <p>Behaviour per {@code summaryType}: 1 gives a keyword summary from
 * {@code Summarizer}; 3 gives the text passed through
 * {@code StringUtils.trimAllWhitespace}; every other value gives a
 * {@code FullTextSummary}.
 * NOTE(review): the original comment listed "0 = no summary, 1 = keyword
 * summary, 2 = full-text summary, 3 = return full text, default = keyword
 * summary", but 0 and unknown values actually produce a full-text summary.
 * Confirm which is intended before changing it.
 *
 * @param details hit whose "docid", "segment" and "docNo" values locate the
 *     document; its "title" value is also read for a full-text summary
 * @param query query handed to the keyword summarizer
 * @param summaryType summary selector, see above
 * @return the summary text
 * @throws IOException if no lock slot could be obtained, the wait was
 *     interrupted, the text could not be read, or the full-text summarizer
 *     failed
 */
public String getSummaryNew(HitDetails details, Query query, int summaryType)
    throws IOException {
  int lock = acquireSearcherLock();
  LOG.info("getSummary(new): get lock :" + lock);
  String text;
  Exception failure = null;
  try {
    long docid = getGlobDocNo(details);
    CacheableObject cached = (CacheableObject) cache.get(docid);
    if (cached == null || cached.getSize() == 0) {
      Segment segment = getSegment(details);
      int docNo = getDocNo(details);
      text = segment.getParseText(docNo).getText();
      if (docid > 0) {
        // Store through the same instance that cache.get() reads from.
        cache.add(docid, new CacheableObject(text));
      }
    } else {
      text = (String) cached.getObject();
    }
  } catch (IOException e) {
    failure = e;
    throw e;
  } catch (Exception e) {
    failure = e;
    throw wrapAsIOException(e.getMessage(), e);
  } finally {
    SearcherLock.unLock(lock);
    if (failure == null) {
      LOG.info("getSummary(new): free lock :" + lock);
    } else {
      LOG.info("getSummary(new e): free lock :" + lock + " e=" + failure);
    }
  }
  if (summaryType == 1) {
    return new Summarizer().getSummary(text, query).toString();
  }
  if (summaryType == 3) {
    return StringUtils.trimAllWhitespace(text);
  }
  return buildFullTextSummary(details, text);
}

/**
 * Obtains a lock slot from SearcherLock, retrying while it returns a negative value.
 *
 * <p>Sleeps 100 ms between attempts and gives up after 20 failed retries. If
 * the thread is interrupted while sleeping, its interrupt flag is restored
 * before the failure is reported.
 */
private static int acquireSearcherLock() throws IOException {
  final int maxRetries = 20;
  final long retryDelayMillis = 100;
  int lock = SearcherLock.getLock();
  int retries = 0;
  while (lock < 0) {
    retries++;
    if (retries > maxRetries) {
      throw new IOException("Can't get a lock!");
    }
    try {
      Thread.sleep(retryDelayMillis);
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
      throw wrapAsIOException(e.getMessage(), e);
    }
    lock = SearcherLock.getLock();
  }
  return lock;
}

/**
 * Summarizes the text with FullTextSummary, using the hit's "title" value.
 *
 * <p>Title and text are passed as they are; an earlier revision segmented both
 * with WordsSegment first. A warning is logged when the summary is shorter
 * than 50 GBK bytes although the text is longer than 50 GBK bytes.
 */
private String buildFullTextSummary(HitDetails details, String text) throws IOException {
  String title = details.getValue("title");
  try {
    int textLen = text.getBytes("GBK").length;
    String summary = FullTextSummary.summary(title, text);
    int sumLen = summary.getBytes("GBK").length;
    if (sumLen < 50 && textLen > 50) {
      LOG.warn("Summary Error:\ntitle:" + title + "\ntext:" + text + "\nsummary:" + summary);
    }
    return summary;
  } catch (Exception e) {
    throw wrapAsIOException("Get Full Text Summary Error!", e);
  }
}

/** Creates an IOException with the given message that keeps the original failure as its cause. */
private static IOException wrapAsIOException(String message, Throwable cause) {
  IOException wrapped = new IOException(message);
  wrapped.initCause(cause);
  return wrapped;
}
/**
 * Returns the summaries of several hits, in the order of the input array.
 *
 * @param details hits to summarize
 * @param query query handed to the single-hit summary method
 * @param fullTextSummary flag handed to the single-hit summary method
 * @return one summary per hit
 * @throws IOException if summarizing any hit fails
 */
public String[] getSummary(HitDetails[] details, Query query, boolean fullTextSummary)
    throws IOException {
  final int count = details.length;
  String[] summaries = new String[count];
  int idx = 0;
  while (idx < count) {
    summaries[idx] = getSummary(details[idx], query, fullTextSummary);
    idx++;
  }
  return summaries;
}
/**
 * Returns the summaries of several hits, in the order of the input array,
 * using the summary-type variant.
 *
 * @param details hits to summarize
 * @param query query handed to the single-hit summary method
 * @param summaryType summary selector handed to the single-hit summary method
 * @return one summary per hit
 * @throws IOException if summarizing any hit fails
 */
public String[] getSummaryNew(HitDetails[] details, Query query, int summaryType)
    throws IOException {
  final int count = details.length;
  String[] summaries = new String[count];
  int idx = 0;
  while (idx < count) {
    summaries[idx] = getSummaryNew(details[idx], query, summaryType);
    idx++;
  }
  return summaries;
}
/**
 * Looks up the registered segment named by the hit's "segment" value.
 *
 * @param details hit whose "segment" value is the lookup key
 * @return the registered segment, never null
 * @throws IOException if no segment is registered under that name, for example
 *     because it was never added or has been removed; this replaces the bare
 *     NullPointerException callers would otherwise hit on the returned value
 */
private Segment getSegment(HitDetails details) throws IOException {
  String name = details.getValue("segment");
  Segment segment = segments.get(name);
  if (segment == null) {
    throw new IOException("No registered segment named: " + name);
  }
  return segment;
}
/**
 * Parses the hit's "docNo" value as a hexadecimal integer.
 *
 * @param details hit to read the value from
 * @return the document number within the hit's segment
 */
private int getDocNo(HitDetails details) {
  String hex = details.getValue("docNo");
  return Integer.parseInt(hex, 16);
}
/**
 * Parses the hit's "docid" value as a decimal long.
 *
 * @param details hit to read the value from
 * @return the parsed document id, used as the cache key
 */
private long getGlobDocNo(HitDetails details) {
  String decimal = details.getValue("docid");
  return Long.parseLong(decimal, 10);
}
/**
 * Returns the configured cache size limit.
 *
 * @return the current value of the maxSize field
 */
public int getMaxSize() {
  return this.maxSize;
}
/**
 * Sets the cache size limit and applies it to the cache immediately.
 *
 * <p>The field alone is only read while an instance is being constructed, so
 * without the call on the cache a later change would have no effect.
 *
 * @param maxSize new size limit for the cache
 */
public void setMaxSize(int maxSize) {
  this.maxSize = maxSize;
  cache.setMaxSize(maxSize);
}
}
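/*
 * Keyboard shortcuts (help text that appears to come from the web code viewer
 * this listing was copied from; it is not part of the program):
 *   Copy code             Ctrl + C
 *   Search code           Ctrl + F
 *   Full-screen mode      F11
 *   Toggle theme          Ctrl + Shift + D
 *   Show shortcuts        ?
 *   Increase font size    Ctrl + =
 *   Decrease font size    Ctrl + -
 */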