⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 fetchedsegments.java

📁 nutch搜索的改进型工具和优化爬虫的相关工具
💻 JAVA
字号:
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.searcher;

import java.io.IOException;
import java.io.File;

import java.util.HashMap;

import org.apache.log4j.Logger;

import net.nutch.io.*;
import net.nutch.fs.*;
import net.nutch.fetcher.*;
import net.nutch.protocol.*;
import net.nutch.parse.*;
import net.nutch.indexer.*;
import kit.nlp.util.summary.*;
import kit.nlp.util.*;
import net.nutch.tools.StringUtils;
import com.jivesoftware.util.Cache;
import com.jivesoftware.util.CacheableObject;

/** Implements {@link HitSummarizer} and {@link HitContent} for a set of
 * fetched segments. */
public class FetchedSegments implements HitSummarizer, HitContent {
	
	// Cache consulted by getSummaryNew; lookups are keyed by the hit's "docid" value.
	private Cache cache = new Cache();
	// Lower bound the constructors enforce on the cache via cache.setMaxSize
	// (100*1024*1024; presumably a byte count -- confirm against Cache).
	private  int maxSize = 100*1024*1024;

  /**
   * Lazily opened readers over the data directories of one segment.
   *
   * <p>Each {@link ArrayFile.Reader} is created on first use. Creation of every
   * reader is guarded by this object's monitor so that concurrent callers never
   * open the same directory twice; the record reads themselves are not
   * serialized by this class.
   */
  private static class Segment {
    private final NutchFileSystem nfs;
    private final File segmentDir;

    // Opened on demand; only touched while holding this object's monitor.
    private ArrayFile.Reader fetcher;
    private ArrayFile.Reader content;
    private ArrayFile.Reader parsetext;
    private ArrayFile.Reader parsedata;

    /**
     * @param nfs file system used to open the segment's readers
     * @param segmentDir directory of the segment
     * @throws IOException declared for callers; nothing is opened here
     */
    public Segment(NutchFileSystem nfs, File segmentDir) throws IOException {
      this.nfs = nfs;
      this.segmentDir = segmentDir;
    }

    /** Returns the directory this segment reads from. */
    public File getSegmentDir() {
      return segmentDir;
    }

    /** Reads the {@link FetcherOutput} record stored at {@code docNo}. */
    public FetcherOutput getFetcherOutput(int docNo) throws IOException {
      FetcherOutput entry = new FetcherOutput();
      fetcherReader().get(docNo, entry);
      return entry;
    }

    /** Reads the {@link Content} record at {@code docNo} and returns its bytes. */
    public byte[] getContent(int docNo) throws IOException {
      Content entry = new Content();
      contentReader().get(docNo, entry);
      return entry.getContent();
    }

    /** Reads the {@link ParseData} record stored at {@code docNo}. */
    public ParseData getParseData(int docNo) throws IOException {
      ParseData entry = new ParseData();
      parseDataReader().get(docNo, entry);
      return entry;
    }

    /** Reads the {@link ParseText} record stored at {@code docNo}. */
    public ParseText getParseText(int docNo) throws IOException {
      ParseText entry = new ParseText();
      parseTextReader().get(docNo, entry);
      return entry;
    }

    /**
     * Closes every reader that has been opened so far.
     *
     * <p>All readers are attempted even when one of them fails, so a single
     * failure no longer leaves the remaining files open.
     *
     * @throws IOException the first failure encountered while closing
     */
    // Add by Xie Shuqiang.2005-08-17
    public synchronized void close() throws IOException {
      ArrayFile.Reader[] opened = { fetcher, content, parsetext, parsedata };
      IOException firstFailure = null;
      for (int i = 0; i < opened.length; i++) {
        if (opened[i] == null) {
          continue;
        }
        try {
          opened[i].close();
        } catch (IOException e) {
          if (firstFailure == null) {
            firstFailure = e;
          }
        }
      }
      if (firstFailure != null) {
        throw firstFailure;
      }
    }

    /** Returns the fetcher-output reader, opening it on first use. */
    private synchronized ArrayFile.Reader fetcherReader() throws IOException {
      if (fetcher == null) {
        fetcher = open(FetcherOutput.DIR_NAME);
      }
      return fetcher;
    }

    /** Returns the content reader, opening it on first use. */
    private synchronized ArrayFile.Reader contentReader() throws IOException {
      if (content == null) {
        content = open(Content.DIR_NAME);
      }
      return content;
    }

    /** Returns the parse-data reader, opening it on first use. */
    private synchronized ArrayFile.Reader parseDataReader() throws IOException {
      if (parsedata == null) {
        parsedata = open(ParseData.DIR_NAME);
      }
      return parsedata;
    }

    /** Returns the parse-text reader, opening it on first use. */
    private synchronized ArrayFile.Reader parseTextReader() throws IOException {
      if (parsetext == null) {
        parsetext = open(ParseText.DIR_NAME);
      }
      return parsetext;
    }

    /** Opens a reader on the named sub-directory of the segment directory. */
    private ArrayFile.Reader open(String dirName) throws IOException {
      return new ArrayFile.Reader(nfs, new File(segmentDir, dirName).toString());
    }

  }

  // Registered segments, keyed by the segment directory's name (File.getName()).
  private HashMap<String, Segment> segments = new HashMap<String, Segment>();
  
  // Logger obtained under the name "lock"; this class uses it for lock
  // acquire/release tracing and for short-summary warnings.
  public static final Logger LOG = Logger.getLogger("lock");

  /**
   * Builds the segment table from one parent directory of fetcher output.
   *
   * <p>Every child of {@code segmentsDir} that contains a regular file named
   * {@link IndexSegment#IDX_DONE_NAME} is registered under its directory name;
   * children without that marker are ignored. Afterwards the cache limit is
   * raised to {@code maxSize} if it is currently lower.
   *
   * @param nfs file system used to list and probe the directories
   * @param segmentsDir parent directory whose children are segment directories
   * @throws IOException if listing or probing the file system fails
   */
  public FetchedSegments(NutchFileSystem nfs, String segmentsDir) throws IOException {
    // segmentsDir is the directory at the "current"/"today" level, i.e. the
    // parent of the individual segment directories.
    File[] candidates = nfs.listFiles(new File(segmentsDir));
    if (candidates == null) {
      candidates = new File[0];
    }

    for (File candidate : candidates) {
      File doneMarker = new File(candidate, IndexSegment.IDX_DONE_NAME);
      boolean indexed = nfs.exists(doneMarker) && nfs.isFile(doneMarker);
      if (indexed) {
        segments.put(candidate.getName(), new Segment(nfs, candidate));
      }
    }

    if (maxSize > cache.getMaxSize()) {
      cache.setMaxSize(maxSize);
    }
  }
  
  /**
   * Builds the segment table from several parent directories.
   *
   * <p>Each entry of {@code segmentsDirs} is listed in order; every child that
   * contains a regular file named {@link IndexSegment#IDX_DONE_NAME} is
   * registered under its directory name, so a later directory with the same
   * name replaces an earlier one. Afterwards the cache limit is raised to
   * {@code maxSize} if it is currently lower.
   *
   * @param nfs file system used to list and probe the directories
   * @param segmentsDirs parent directories whose children are segment directories
   * @throws IOException if listing or probing the file system fails
   */
  public FetchedSegments(NutchFileSystem nfs, String[] segmentsDirs) throws IOException {
    for (String parentDir : segmentsDirs) {
      File[] candidates = nfs.listFiles(new File(parentDir));
      if (candidates == null) {
        continue;
      }
      for (File candidate : candidates) {
        File doneMarker = new File(candidate, IndexSegment.IDX_DONE_NAME);
        boolean indexed = nfs.exists(doneMarker) && nfs.isFile(doneMarker);
        if (indexed) {
          segments.put(candidate.getName(), new Segment(nfs, candidate));
        }
      }
    }

    if (maxSize > cache.getMaxSize()) {
      cache.setMaxSize(maxSize);
    }
  }

  /**
   * Registers a single segment directory if it has been indexed.
   *
   * <p>The segment is added under its directory name only when it contains a
   * regular file named {@link IndexSegment#IDX_DONE_NAME}; otherwise the call
   * does nothing.
   *
   * @param nfs file system to use; a new {@link LocalFileSystem} is used when null
   * @param segmentName path of the segment directory
   * @throws IOException if probing the file system fails
   */
  public void addSegment(NutchFileSystem nfs,String segmentName) throws IOException {
    NutchFileSystem fs = (nfs != null) ? nfs : new LocalFileSystem();
    File segmentDir = new File(segmentName);
    File doneMarker = new File(segmentDir, IndexSegment.IDX_DONE_NAME);
    if (!fs.exists(doneMarker) || !fs.isFile(doneMarker)) {
      return;
    }
    segments.put(segmentDir.getName(), new Segment(fs, segmentDir));
  }
  
  // Add by Xie Shuqiang.2005-08-17
  /**
   * Unregisters a segment and closes its readers.
   *
   * <p>A null or empty name is ignored, as is a name that is not registered.
   * The lookup key is the last path component of {@code segmentName}.
   *
   * @param segmentName path or name of the segment directory
   * @throws IOException if closing the segment's readers fails
   */
  public void delSegment(String segmentName) throws IOException {
    boolean hasName = segmentName != null && segmentName.length() != 0;
    if (hasName) {
      String key = new File(segmentName).getName();
      Segment removed = segments.remove(key);
      if (removed != null) {
        removed.close();
      }
    }
  }
//Add by Xie Shuqiang.2005-08-17
  /**
   * Unregisters several segments by calling {@link #delSegment(String)} for
   * each entry in order. A null array is ignored.
   *
   * @param segmentName paths or names of the segment directories
   * @throws IOException if closing any segment's readers fails
   */
  public void delSegments(String segmentName[]) throws IOException {
    if (segmentName != null) {
      for (String name : segmentName) {
        delSegment(name);
      }
    }
  }
  
  /** Returns the names of all currently registered segments. */
  public String[] getSegmentNames() {
    String[] names = new String[segments.size()];
    return segments.keySet().toArray(names);
  }

  /**
   * Returns the stored content bytes for a hit.
   *
   * @param details hit whose "segment" and "docNo" values locate the record
   * @throws IOException if the segment data cannot be read
   */
  public byte[] getContent(HitDetails details) throws IOException {
    Segment segment = getSegment(details);
    int docNo = getDocNo(details);
    return segment.getContent(docNo);
  }

  /**
   * Returns the stored {@link ParseData} for a hit.
   *
   * @param details hit whose "segment" and "docNo" values locate the record
   * @throws IOException if the segment data cannot be read
   */
  public ParseData getParseData(HitDetails details) throws IOException {
    Segment segment = getSegment(details);
    int docNo = getDocNo(details);
    return segment.getParseData(docNo);
  }

  /**
   * Returns the anchors recorded in the fetch-list entry of a hit.
   *
   * @param details hit whose "segment" and "docNo" values locate the record
   * @throws IOException if the segment data cannot be read
   */
  public String[] getAnchors(HitDetails details) throws IOException {
    Segment segment = getSegment(details);
    FetcherOutput output = segment.getFetcherOutput(getDocNo(details));
    return output.getFetchListEntry().getAnchors();
  }

  /**
   * Returns the fetch date recorded in the fetcher output of a hit.
   *
   * @param details hit whose "segment" and "docNo" values locate the record
   * @throws IOException if the segment data cannot be read
   */
  public long getFetchDate(HitDetails details) throws IOException {
    Segment segment = getSegment(details);
    FetcherOutput output = segment.getFetcherOutput(getDocNo(details));
    return output.getFetchDate();
  }

  /**
   * Returns the stored {@link ParseText} for a hit.
   *
   * @param details hit whose "segment" and "docNo" values locate the record
   * @throws IOException if the segment data cannot be read
   */
  public ParseText getParseText(HitDetails details) throws IOException {
    Segment segment = getSegment(details);
    int docNo = getDocNo(details);
    return segment.getParseText(docNo);
  }

  /**
   * Produces a summary for one hit.
   *
   * <p>The parsed text is read while holding a {@link SearcherLock}. Acquiring
   * the lock is retried up to 20 times with a 100 ms pause between attempts.
   * The lock is always released, including when reading fails with an error
   * that is not an {@link Exception}.
   *
   * @param details hit to summarize; its "segment", "docNo" and "title" values are used
   * @param query query handed to {@link Summarizer} for the non-full-text summary
   * @param fullTextSummary when true the summary comes from
   *        {@code FullTextSummary.summary(title, text)}, otherwise from
   *        {@code Summarizer.getSummary(text, query)}
   * @return the summary text
   * @throws IOException if no lock could be obtained, the waiting thread was
   *         interrupted, the text could not be read, or summarizing failed;
   *         the underlying failure is attached as the cause
   */
  public String getSummary(HitDetails details, Query query, boolean fullTextSummary)
  throws IOException {
    int lock = SearcherLock.getLock();
    int times = 0;
    while (lock < 0) {
      times++;
      if (times > 20) {
        throw new IOException("Can't get a lock!");
      }
      try {
        Thread.sleep(100);
      } catch (InterruptedException e) {
        // Keep the interrupt visible to callers further up the stack.
        Thread.currentThread().interrupt();
        IOException interrupted = new IOException(e.getMessage());
        interrupted.initCause(e);
        throw interrupted;
      }
      lock = SearcherLock.getLock();
    }
    LOG.info("getSummary(old): get lock :" + lock);

    String text;
    try {
      Segment segment = getSegment(details);
      if (segment == null) {
        // Without this check the failure surfaced as an IOException with a null message.
        throw new IOException("Unknown segment: " + details.getValue("segment"));
      }
      int docNo = getDocNo(details);
      text = segment.getParseText(docNo).getText();
    } catch (Exception e) {
      IOException wrapped = new IOException(e.getMessage());
      wrapped.initCause(e);
      throw wrapped;
    } finally {
      // Released on every exit path so a failed read can never leak the lock.
      SearcherLock.unLock(lock);
      LOG.info("getSummary(old): free lock :" + lock);
    }

    if (!fullTextSummary) {
      return new Summarizer().getSummary(text, query).toString();
    }

    String title = details.getValue("title");
    try {
      int textLen = text.getBytes("GBK").length;
      String summary = FullTextSummary.summary(title, text);
      int sumLen = summary.getBytes("GBK").length;
      // A very short summary of a non-trivial text is logged for inspection.
      if (sumLen < 50 && textLen > 50) {
        LOG.warn("Summary Error:\ntitle:" + title + "\ntext:" + text + "\nsummary:" +summary);
      }
      return summary;
    } catch (Exception e) {
      IOException wrapped = new IOException("Get Full Text Summary Error!");
      wrapped.initCause(e);
      throw wrapped;
    }
  }

  /**
   * Produces a summary for one hit, consulting the text cache first.
   *
   * <p>The parsed text is looked up in the cache under the hit's "docid" value
   * and read from the segment only on a miss (or an entry of size 0); text read
   * from the segment is added to the cache when the docid is positive. All of
   * this happens while holding a {@link SearcherLock}, which is retried up to
   * 20 times with a 100 ms pause and is always released afterwards.
   *
   * <p>Handling of {@code summaryType} as implemented:
   * <ul>
   *   <li>1 - keyword summary via {@code Summarizer.getSummary(text, query)}</li>
   *   <li>3 - the whole text passed through {@code StringUtils.trimAllWhitespace}</li>
   *   <li>any other value - {@code FullTextSummary.summary(title, text)}</li>
   * </ul>
   * NOTE(review): the legacy comment described 0 as "no summary" and the
   * keyword summary as the default, but the code routes 0 (and 2) to the
   * full-text summary; behavior is left unchanged here.
   *
   * @param details hit to summarize; its "docid", "segment", "docNo" and "title" values are used
   * @param query query handed to {@link Summarizer} for the keyword summary
   * @param summaryType selects the kind of summary, see above
   * @return the summary text
   * @throws IOException if no lock could be obtained, the waiting thread was
   *         interrupted, the text could not be obtained, or summarizing failed;
   *         the underlying failure is attached as the cause
   */
  public String getSummaryNew(HitDetails details, Query query, int summaryType)
  throws IOException {
    int lock = SearcherLock.getLock();
    int times = 0;
    while (lock < 0) {
      times++;
      if (times > 20) {
        throw new IOException("Can't get a lock!");
      }
      try {
        Thread.sleep(100);
      } catch (InterruptedException e) {
        // Keep the interrupt visible to callers further up the stack.
        Thread.currentThread().interrupt();
        IOException interrupted = new IOException(e.getMessage());
        interrupted.initCause(e);
        throw interrupted;
      }
      lock = SearcherLock.getLock();
    }
    LOG.info("getSummary(new): get lock :" + lock);

    String text = "";
    Exception failure = null;
    try {
      long docid = getGlobDocNo(details);

      CacheableObject cob = (CacheableObject)cache.get(docid);
      if (cob == null || cob.getSize() == 0) {
        Segment segment = getSegment(details);
        if (segment == null) {
          // Without this check the failure surfaced as an IOException with a null message.
          throw new IOException("Unknown segment: " + details.getValue("segment"));
        }
        int docNo = getDocNo(details);
        text = segment.getParseText(docNo).getText();
        if (docid > 0) {
          // NOTE(review): add is invoked statically while get goes through the
          // instance -- confirm both operate on the same store.
          Cache.add(docid, new CacheableObject(text));
        }
      } else {
        text = (String)cob.getObject();
      }
    } catch (Exception e) {
      failure = e;
    } finally {
      // Released on every exit path so a failed read can never leak the lock.
      SearcherLock.unLock(lock);
    }
    if (failure != null) {
      LOG.info("getSummary(new e): free lock :" + lock + " e="+failure);
      IOException wrapped = new IOException(failure.getMessage());
      wrapped.initCause(failure);
      throw wrapped;
    }
    LOG.info("getSummary(new): free lock :" + lock);

    if (summaryType == 1) {
      return new Summarizer().getSummary(text, query).toString();
    } else if (summaryType == 3) {
      return StringUtils.trimAllWhitespace(text);
    }

    String title = details.getValue("title");
    try {
      int textLen = text.getBytes("GBK").length;
      String summary = FullTextSummary.summary(title, text);
      int sumLen = summary.getBytes("GBK").length;
      // A very short summary of a non-trivial text is logged for inspection.
      if (sumLen < 50 && textLen > 50) {
        LOG.warn("Summary Error:\ntitle:" + title + "\ntext:" + text + "\nsummary:" +summary);
      }
      return summary;
    } catch (Exception e) {
      IOException wrapped = new IOException("Get Full Text Summary Error!");
      wrapped.initCause(e);
      throw wrapped;
    }
  }
  
  /**
   * Summarizes several hits by calling
   * {@link #getSummary(HitDetails, Query, boolean)} for each one in order.
   *
   * @return one summary per entry of {@code details}, in the same order
   * @throws IOException as soon as summarizing any single hit fails
   */
  public String[] getSummary(HitDetails[] details, Query query, boolean fullTextSummary)
    throws IOException {
    final int count = details.length;
    String[] summaries = new String[count];
    int index = 0;
    while (index < count) {
      summaries[index] = getSummary(details[index], query, fullTextSummary);
      index++;
    }
    return summaries;
  }

  /**
   * Summarizes several hits by calling
   * {@link #getSummaryNew(HitDetails, Query, int)} for each one in order.
   *
   * @return one summary per entry of {@code details}, in the same order
   * @throws IOException as soon as summarizing any single hit fails
   */
  public String[] getSummaryNew(HitDetails[] details, Query query, int summaryType)
  throws IOException {
    final int count = details.length;
    String[] summaries = new String[count];
    int index = 0;
    while (index < count) {
      summaries[index] = getSummaryNew(details[index], query, summaryType);
      index++;
    }
    return summaries;
  }
  
  /**
   * Looks up the registered segment named by the hit's "segment" value.
   *
   * @return the segment, or null when no segment is registered under that name
   */
  private Segment getSegment(HitDetails details) {
    String segmentName = details.getValue("segment");
    return segments.get(segmentName);
  }

  /** Parses the hit's "docNo" value, which is stored as a hexadecimal string. */
  private int getDocNo(HitDetails details) {
    String hexDocNo = details.getValue("docNo");
    return Integer.parseInt(hexDocNo, 16);
  }

  /** Parses the hit's "docid" value, which is stored as a decimal string. */
  private long getGlobDocNo(HitDetails details) {
    String decimalDocId = details.getValue("docid");
    return Long.parseLong(decimalDocId, 10);
  }

/**
 * Returns the configured minimum cache size, i.e. the current value of the
 * {@code maxSize} field (not the cache's own limit).
 */
public int getMaxSize() {
	return this.maxSize;
}

/**
 * Sets the minimum cache size and applies it to the cache.
 *
 * <p>The constructors read {@code maxSize} only once, so storing the value
 * alone would never reach the cache. The same grow-only rule the constructors
 * use is applied here: the cache limit is raised when it is below the new
 * value and is never lowered.
 *
 * @param maxSize new minimum cache size
 */
public void setMaxSize(int maxSize) {
	this.maxSize = maxSize;
	if (cache.getMaxSize() < maxSize) {
		cache.setMaxSize(maxSize);
	}
}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -