📄 summarizer.java

📁 Improved tools for Nutch search and related crawler-optimization utilities
💻 JAVA
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.searcher;

import java.io.*;
import java.util.*;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;

import net.nutch.searcher.Summary;
import net.nutch.searcher.Summary.*;
import net.nutch.searcher.Query;
import net.nutch.analysis.*;

import kit.nlp.util.*;

/** Implements hit summarization. */
public class Summarizer {
	
	/** The number of context terms to display preceding and following matches.*/
	private static final int SUM_CONTEXT = 6;
		//NutchConf.getInt("searcher.summary.context", 5);
	
	/** The total number of terms to display in a summary (may be overridden per-query in getSummary). */
	private int SUM_LENGTH = 80;
		//NutchConf.getInt("searcher.summary.length", 20);
	
	/** Converts text to tokens. */
	private static final Analyzer ANALYZER = new ContentAnalyzer();
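	// (Referenced only by the commented-out analyzer-based path in getTokens().)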
	
	/**
	 * An Excerpt is a single passage found in the document,
	 * with the matching regions highlighted.
	 */
	class Excerpt {
		Vector<Fragment> passages = new Vector<Fragment>();
		SortedSet<String> tokenSet = new TreeSet<String>();
		/**
		 * Added by xie shuqiang: high-frequency (stop) words are kept
		 * in a separate set so they can be weighted differently.
		 */
		SortedSet<String> highFreqTokenSet = new TreeSet<String>();
		
		int numTerms = 0;
		
		/** Creates an empty excerpt. */
		public Excerpt() {
		}
		
		/**
		 * Adds a token to this excerpt, routing high-frequency (stop)
		 * words into their own set.  (Modified by xie shuqiang.)
		 */
		public void addToken(String token) {
			if (!Stopwords.isHighFreq(token))
				tokenSet.add(token);
			else {
				highFreqTokenSet.add(token);
			}
		}
		
		/**
		 * Returns a weighted count of the unique tokens in this excerpt:
		 * an ordinary term counts four times as much as a high-frequency
		 * (stop) word.  (Modified by xie shuqiang.)
		 */
		public int numUniqueTokens() {
			return (tokenSet.size() * 4 + highFreqTokenSet.size());
		}
		
		/**
		 * Returns the number of fragments in this excerpt.
		 */
		public int numFragments() {
			return passages.size();
		}
		
		public void setNumTerms(int numTerms) {
			this.numTerms = numTerms;
		}
		
		public int getNumTerms() {
			return numTerms;
		}
		
		/**
		 * Adds a fragment to the passage list.
		 */
		public void add(Fragment fragment) {
			passages.add(fragment);
		}
		
		/**
		 * Returns an Enumeration over all fragments.
		 */
		public Enumeration<Fragment> elements() {
			return passages.elements();
		}
	}
	
	/** Returns a summary for the given text, which must already be segmented into space-delimited tokens. */
	public Summary getSummary(String text, Query query) throws IOException {
		
		// Simplistic implementation.  Finds the first fragments in the document
		// containing any query terms.
		//
		// TODO: check that phrases in the query are matched in the fragment
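		// A per-query summary length, if given, overrides the default.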
		SUM_LENGTH = query.getSummaryLen() > 0 ? query.getSummaryLen() : SUM_LENGTH;
		Token[] tokens = getTokens(text);             // parse text to token array

		if (tokens.length == 0)
			return new Summary();
		
		// Split the query string on whitespace to collect its terms.
		StringTokenizer queryTokens = new StringTokenizer(query.getQueryStr());
		ArrayList<String> termList = new ArrayList<String>();
		while (queryTokens.hasMoreTokens()) {
			termList.add(queryTokens.nextToken());
		}
		HashSet<String> highlight = new HashSet<String>();
		if (termList.size() > 0) {
			String[] terms = termList.toArray(new String[termList.size()]);
			// Add each '/'-separated part of every term to the highlight set.
			for (int i = 0; i < terms.length; i++) {
				StringTokenizer st = new StringTokenizer(terms[i], "/");
				while (st.hasMoreTokens()) {
					highlight.add(st.nextToken());
				}
			}
		}
		
		//
		// Create a SortedSet that ranks excerpts according to
		// how many query terms are present.  An excerpt is
		// a Vector full of Fragments and Highlights
		//
		SortedSet<Excerpt> excerptSet = new TreeSet<Excerpt>(new Comparator<Excerpt>() {
			public int compare(Excerpt excerpt1, Excerpt excerpt2) {
				if (excerpt1 == null && excerpt2 != null) {
					return -1;
				} else if (excerpt1 != null && excerpt2 == null) {
					return 1;
				} else if (excerpt1 == null && excerpt2 == null) {
					return 0;
				}
				
				int numToks1 = excerpt1.numUniqueTokens();
				int numToks2 = excerpt2.numUniqueTokens();
				
				if (numToks1 < numToks2) {
					return -1;
				} else if (numToks1 == numToks2) {
					return excerpt1.numFragments() - excerpt2.numFragments();
				} else {
					return 1;
				}
			}
		});
		
		//
		// Iterate through all terms in the document
		//
		int lastExcerptPos = 0;
		for (int i = 0; i < tokens.length; i++) {
			//
			// If we find a term that's in the query...
			//
			if (highlight.contains(tokens[i].termText())) {
				//
				// Start searching at a point SUM_CONTEXT terms back,
				// and move SUM_CONTEXT terms into the future.
				//
				int startToken = (i > SUM_CONTEXT) ? i-SUM_CONTEXT : 0;
				int endToken = Math.min(i+SUM_CONTEXT, tokens.length);
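				// offset marks the start of the next un-highlighted fragment.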
				int offset = startToken;
				int j = startToken;
				
				// Iterate from the start point to the finish, adding
				// terms all the way.  The end of the passage is always
				// SUM_CONTEXT beyond the last query-term.
				//
				Excerpt excerpt = new Excerpt();
				if (startToken > 0) {
					excerpt.add(new Summary.Ellipsis());
				}
				
				//
				// Iterate through as long as we're before the end of
				// the document and we haven't hit the max-number-of-items
				// -in-a-summary.
				//
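				// totalLength counts the tokens this passage has consumed;
				// once it exceeds SUM_LENGTH the passage is closed early.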
				int totalLength = 0;

				while (j < endToken) {
					//
					// Now grab the hit-element, if present
					//
					Token t = tokens[j];
					if (highlight.contains(t.termText())) {
						excerpt.addToken(t.termText());
						String temp = "";
			            int begin = offset;
			            boolean needSpace = false;
			            while ( begin < j ) {
			            	String termText = tokens[begin++].termText();
			            	boolean isCharString = termText.matches("[a-zA-Z0-9]+");
			            	if (needSpace && isCharString) temp += " ";
			            	if (isCharString){
			            		needSpace = true;
			            	}else
			            		needSpace = false;
			            	temp += termText;
			            }
			            excerpt.add(new Fragment(temp));
						excerpt.add(new Highlight(t.termText()));
						offset = j+1;
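						// Extend the window so it always reaches SUM_CONTEXT
						// tokens past the most recent hit.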
						endToken = Math.min(j+SUM_CONTEXT, tokens.length);
					}
					
					totalLength++;
					if (totalLength > SUM_LENGTH) {
						// The summary budget is exhausted: flush the text up
						// to and including this token, then end the passage.
						excerpt.add(new Fragment(joinTokens(tokens, offset, j + 1)));
						break;
					}
					
					j++;
				}
				
				// Remember where this excerpt ends, to decide later whether
				// the summary needs a trailing ellipsis.
				lastExcerptPos = endToken;
				
				//
				// We found the series of search-term hits and added
				// them (with intervening text) to the excerpt.  Now
				// add the trailing edge of text, unless the summary
				// budget was already exhausted above.
				//
				if (totalLength <= SUM_LENGTH) {
					excerpt.add(new Fragment(joinTokens(tokens, offset, endToken)));
				}
				
				//
				// Remember how many terms are in this excerpt
				//
				excerpt.setNumTerms(j - startToken);
				
				//
				// Store the excerpt for later sorting
				//
				excerptSet.add(excerpt);
				
				//
				// Resume the scan SUM_CONTEXT tokens further on, past the
				// window just excerpted, so excerpts don't overlap.
				//
				i = j+SUM_CONTEXT;
			}
		}
		
		//
		// If the target text doesn't appear, then we just
		// excerpt the first SUM_LENGTH words from the document.
		//
		if (excerptSet.size() == 0) {
			Excerpt excerpt = new Excerpt();
			int excerptLen = Math.min(SUM_LENGTH, tokens.length);
			lastExcerptPos = excerptLen;
			
			String temp = "";
	        int begin = 0;
	        boolean needSpace = false;
	        while ( begin < excerptLen-1 ) {
	        	String termText = tokens[begin++].termText();
            	boolean isCharString = termText.matches("[a-zA-Z0-9]+");
            	if (needSpace && isCharString) temp += " ";
            	if (isCharString){
            		needSpace = true;
            	}else
            		needSpace = false;
            	temp += termText;
	        }
	        excerpt.add(new Fragment(temp));
			excerpt.setNumTerms(excerptLen);
			excerptSet.add(excerpt);
		}
		
		//
		// Now choose the best items from the excerpt set.
		// Stop when our Summary grows too large.
		//
		double tokenCount = 0;
		Summary s = new Summary();
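		// Pull excerpts best-first: the comparator sorts ascending, so
		// the best excerpt is always the last element of the set.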
		while (tokenCount <= SUM_LENGTH && excerptSet.size() > 0) {
			Excerpt excerpt = (Excerpt) excerptSet.last();
			excerptSet.remove(excerpt);
			
			// Prorate the excerpt's token count evenly across its fragments
			// so the SUM_LENGTH budget is charged per fragment.
			double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
			for (Enumeration<Fragment> e = excerpt.elements(); e.hasMoreElements(); ) {
				Fragment f = e.nextElement();
				// Don't add fragments if it takes us over the max-limit
				if (tokenCount + tokenFraction <= SUM_LENGTH) {
					s.add(f);
				}
				tokenCount += tokenFraction;
			}
		}
		
		if (tokenCount > 0 && lastExcerptPos < tokens.length)
			s.add(new Ellipsis());
		return s;
	}
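
	/**
	 * Joins the texts of tokens[from, to) into one string, inserting a
	 * space between adjacent alphanumeric tokens so that Latin words and
	 * numbers stay separated while segmented CJK tokens are concatenated
	 * directly.
	 */
	private String joinTokens(Token[] tokens, int from, int to) {
		StringBuffer buf = new StringBuffer();
		boolean needSpace = false;
		for (int k = from; k < to; k++) {
			String termText = tokens[k].termText();
			boolean isCharString = termText.matches("[a-zA-Z0-9]+");
			if (needSpace && isCharString)
				buf.append(' ');
			needSpace = isCharString;
			buf.append(termText);
		}
		return buf.toString();
	}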
	
	private Token[] getTokens(String text) throws IOException {
		ArrayList<Token> result = new ArrayList<Token>();
		/*
		ContentTokenStream ts = (ContentTokenStream)ANALYZER.tokenStream("content", new StringReader(text));
		for (Token token = ts.next(false); token != null; token = ts.next(false)) {
			if (token.type().equals("word"))
				result.add(token);
		}
		*/
		// Split the pre-segmented text on single spaces, tracking each
		// token's character offset in the original string.
		StringTokenizer st = new StringTokenizer(text, " ");
		int offset = 0;
		while (st.hasMoreTokens()) {
			String token = st.nextToken();
			result.add(new Token(token, offset, offset + token.length()));
			offset += token.length() + 1;    // skip the token and its trailing space
		}
		return result.toArray(new Token[result.size()]);
	}
	
	/**
	 * Tests summary generation.  Takes the name of a text file and a
	 * query string on the command line.
	 */
	public static void main(String argv[]) throws IOException {
		// Test arglist
		if (argv.length < 2) {
			System.out.println("Usage: kit.nlp.search.searcher.Summarizer <textfile> <queryStr>");
			return;
		}
		
		Summarizer s = new Summarizer();
		
		//
		// Parse the args
		//
		File textFile = new File(argv[0]);
		StringBuffer queryBuf = new StringBuffer();
		for (int i = 1; i < argv.length; i++) {
			queryBuf.append(argv[i]);
			queryBuf.append(" ");
		}
		
		//
		// Load the text file into a single string.
		//
		StringBuffer body = new StringBuffer();
		BufferedReader in = new BufferedReader(new FileReader(textFile));
		try {
			System.out.println("About to read " + textFile + " from " + in);
			String str = in.readLine();
			while (str != null) {
				body.append(str);
				body.append(' ');    // keep words on adjacent lines separated
				str = in.readLine();
			}
		} finally {
			in.close();
		}
		String content = body.toString();
		System.out.println("content:" + content);
		// Segment the content into space-delimited words, then parse the
		// query string into a Query and print the resulting summary.
		try {
			content = WordsSegment.segment(content, ' ');
			System.out.println("content seg:" + content);
			Query query = SearchQuery.parse(queryBuf.toString(), 0, 0);
			System.out.println("Summary: '" + s.getSummary(content, query) + "'");
		} catch (Exception e) {
			System.out.println("Failed to build summary: " + e.getMessage());
			e.printStackTrace();
		}
	}
}
