// Source: highlighter.java (org.apache.lucene.search.highlight.Highlighter)
package org.apache.lucene.search.highlight;/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */import java.io.IOException;import java.io.StringReader;import java.util.ArrayList;import java.util.Iterator;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.Token;import org.apache.lucene.analysis.TokenStream;import org.apache.lucene.util.PriorityQueue;/** * Class used to markup highlighted terms found in the best sections of a * text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter}, * {@link Encoder} and tokenizers. 
* @author mark@searcharea.co.uk */public class Highlighter{ public static final int DEFAULT_MAX_DOC_BYTES_TO_ANALYZE=50*1024; private int maxDocBytesToAnalyze=DEFAULT_MAX_DOC_BYTES_TO_ANALYZE; private Formatter formatter; private Encoder encoder; private Fragmenter textFragmenter=new SimpleFragmenter(); private Scorer fragmentScorer=null; public Highlighter(Scorer fragmentScorer) { this(new SimpleHTMLFormatter(),fragmentScorer); } public Highlighter(Formatter formatter, Scorer fragmentScorer) { this(formatter,new DefaultEncoder(),fragmentScorer); } public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer) { this.formatter = formatter; this.encoder = encoder; this.fragmentScorer = fragmentScorer; } /** * Highlights chosen terms in a text, extracting the most relevant section. * This is a convenience method that calls * {@link #getBestFragment(TokenStream, String)} * * @param analyzer the analyzer that will be used to split <code>text</code> * into chunks * @param text text to highlight terms in * @param fieldName Name of field used to influence analyzer's tokenization policy * * @return highlighted text fragment or null if no terms found */ public final String getBestFragment(Analyzer analyzer, String fieldName,String text) throws IOException { TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text)); return getBestFragment(tokenStream, text); } /** * Highlights chosen terms in a text, extracting the most relevant section. * The document text is analysed in chunks to record hit statistics * across the document. After accumulating stats, the fragment with the highest score * is returned * * @param tokenStream a stream of tokens identified in the text parameter, including offset information. * This is typically produced by an analyzer re-parsing a document's * text. 
Some work may be done on retrieving TokenStreams more efficently * by adding support for storing original text position data in the Lucene * index but this support is not currently available (as of Lucene 1.4 rc2). * @param text text to highlight terms in * * @return highlighted text fragment or null if no terms found */ public final String getBestFragment(TokenStream tokenStream, String text) throws IOException { String[] results = getBestFragments(tokenStream,text, 1); if (results.length > 0) { return results[0]; } return null; } /** * Highlights chosen terms in a text, extracting the most relevant sections. * This is a convenience method that calls * {@link #getBestFragments(TokenStream, String, int)} * * @param analyzer the analyzer that will be used to split <code>text</code> * into chunks * @param text text to highlight terms in * @param maxNumFragments the maximum number of fragments. * @deprecated This method incorrectly hardcodes the choice of fieldname. Use the * method of the same name that takes a fieldname. * @return highlighted text fragments (between 0 and maxNumFragments number of fragments) */ public final String[] getBestFragments( Analyzer analyzer, String text, int maxNumFragments) throws IOException { TokenStream tokenStream = analyzer.tokenStream("field", new StringReader(text)); return getBestFragments(tokenStream, text, maxNumFragments); } /** * Highlights chosen terms in a text, extracting the most relevant sections. * This is a convenience method that calls * {@link #getBestFragments(TokenStream, String, int)} * * @param analyzer the analyzer that will be used to split <code>text</code> * into chunks * @param fieldName the name of the field being highlighted (used by analyzer) * @param text text to highlight terms in * @param maxNumFragments the maximum number of fragments. 
* * @return highlighted text fragments (between 0 and maxNumFragments number of fragments) */ public final String[] getBestFragments( Analyzer analyzer, String fieldName, String text, int maxNumFragments) throws IOException { TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text)); return getBestFragments(tokenStream, text, maxNumFragments); } /** * Highlights chosen terms in a text, extracting the most relevant sections. * The document text is analysed in chunks to record hit statistics * across the document. After accumulating stats, the fragments with the highest scores * are returned as an array of strings in order of score (contiguous fragments are merged into * one in their original order to improve readability) * * @param text text to highlight terms in * @param maxNumFragments the maximum number of fragments. * * @return highlighted text fragments (between 0 and maxNumFragments number of fragments) */ public final String[] getBestFragments( TokenStream tokenStream, String text, int maxNumFragments) throws IOException { maxNumFragments = Math.max(1, maxNumFragments); //sanity check TextFragment[] frag =getBestTextFragments(tokenStream,text, true,maxNumFragments); //Get text ArrayList fragTexts = new ArrayList(); for (int i = 0; i < frag.length; i++) { if ((frag[i] != null) && (frag[i].getScore() > 0)) { fragTexts.add(frag[i].toString()); } } return (String[]) fragTexts.toArray(new String[0]); } /** * Low level api to get the most relevant (formatted) sections of the document. * This method has been made public to allow visibility of score information held in TextFragment objects. * Thanks to Jason Calabrese for help in redefining the interface. 
* @param tokenStream * @param text * @param maxNumFragments * @param mergeContiguousFragments * @throws IOException */ public final TextFragment[] getBestTextFragments( TokenStream tokenStream, String text, boolean mergeContiguousFragments, int maxNumFragments) throws IOException { ArrayList docFrags = new ArrayList(); StringBuffer newText=new StringBuffer(); TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size()); fragmentScorer.startFragment(currentFrag); docFrags.add(currentFrag); FragmentQueue fragQueue = new FragmentQueue(maxNumFragments); try { org.apache.lucene.analysis.Token token; String tokenText; int startOffset; int endOffset; int lastEndOffset = 0; textFragmenter.start(text); TokenGroup tokenGroup=new TokenGroup(); token = tokenStream.next(); while ((token!= null)&&(token.startOffset()<maxDocBytesToAnalyze)) { if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(token))) { //the current token is distinct from previous tokens - // markup the cached token group info startOffset = tokenGroup.matchStartOffset; endOffset = tokenGroup.matchEndOffset; tokenText = text.substring(startOffset, endOffset); String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup); //store any whitespace etc from between this and last group if (startOffset > lastEndOffset) newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset))); newText.append(markedUpText); lastEndOffset=Math.max(endOffset, lastEndOffset); tokenGroup.clear(); //check if current token marks the start of a new fragment if(textFragmenter.isNewFragment(token)) { currentFrag.setScore(fragmentScorer.getFragmentScore()); //record stats for a new fragment currentFrag.textEndPos = newText.length(); currentFrag =new TextFragment(newText, newText.length(), docFrags.size()); fragmentScorer.startFragment(currentFrag); docFrags.add(currentFrag); } } tokenGroup.addToken(token,fragmentScorer.getTokenScore(token));// 
if(lastEndOffset>maxDocBytesToAnalyze)// {// break;// } token = tokenStream.next(); } currentFrag.setScore(fragmentScorer.getFragmentScore()); if(tokenGroup.numTokens>0) { //flush the accumulated text (same code as in above loop) startOffset = tokenGroup.matchStartOffset; endOffset = tokenGroup.matchEndOffset; tokenText = text.substring(startOffset, endOffset); String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup); //store any whitespace etc from between this and last group if (startOffset > lastEndOffset)
// NOTE(review): the original capture is truncated here — getBestTextFragments()
// is cut off mid-body (after "if (startOffset > lastEndOffset)") and the trailing
// helper class (FragmentQueue) plus the class closing brace are missing. Recover
// the remainder from the upstream Lucene highlighter source before compiling.