📄 cmshighlightfinder.java
字号:
/*
* File : $Source: /usr/local/cvs/opencms/src/org/opencms/search/documents/CmsHighlightFinder.java,v $
* Date : $Date: 2006/03/27 14:53:05 $
* Version: $Revision: 1.8 $
*
* This library is part of OpenCms -
* the Open Source Content Management System
*
* Copyright (c) 2005 Alkacon Software GmbH (http://www.alkacon.com)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* For further information about Alkacon Software GmbH, please see the
* company website: http://www.alkacon.com
*
* For further information about OpenCms, please see the
* project website: http://www.opencms.org
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package org.opencms.search.documents;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.PriorityQueue;
/**
 * Finds the fragments of a text that best match the terms of a Lucene query and
 * highlights those terms in the returned fragments.<p>
 *
 * Adapted from Maik Schreiber's LuceneTools.java,v 1.5 2001/10/16 07:25:55.
 *
 * Alterations include:
 * + Changed to support Lucene 1.3 release (requires no change to Lucene code
 * base but consequently no longer supports MultiTermQuery, RangeQuery and
 * PrefixQuery highlighting currently)
 * + Performance enhancement - this class caches the query terms and can
 * therefore be called repeatedly to highlight multiple results more efficiently
 * + New feature: can extract the most relevant parts of large bodies of text -
 * with user defined size of extracts
 *
 * @author Maik Schreiber
 */
public final class CmsHighlightFinder {

    /** The analyzer used to tokenize the text to highlight. */
    private final Analyzer m_analyzer;

    /** The term highlighter applied to every token that matches a query term. */
    private final I_CmsTermHighlighter m_highlighter;

    /** The query. */
    private final Query m_query;

    /** The lower-case texts of all terms extracted from the query (Elements: String). */
    private final HashSet m_terms = new HashSet();

    /**
     * Creates a new highlight finder and extracts the non-prohibited terms
     * of the given query once, so they can be reused for every text.<p>
     *
     * @param highlighter
     *            I_CmsTermHighlighter to use to highlight terms in the text
     * @param query
     *            Query which contains the terms to be highlighted in the text
     * @param analyzer
     *            Analyzer used to construct the Query
     * @throws IOException if something goes wrong
     */
    public CmsHighlightFinder(I_CmsTermHighlighter highlighter, Query query, Analyzer analyzer)
    throws IOException {

        m_highlighter = highlighter;
        m_query = query;
        m_analyzer = analyzer;
        // collect the terms of the query, skipping prohibited clauses
        getTerms(m_query, m_terms, false);
    }

    /**
     * Extracts all term texts of a given Query. Term texts will be returned in
     * lower-case.<p>
     *
     * Only BooleanQuery, PhraseQuery and TermQuery are evaluated, any other
     * kind of query (including <code>null</code>) adds nothing to the set.<p>
     *
     * @param query
     *            Query to extract term texts from
     * @param terms
     *            HashSet where extracted term texts should be put into
     *            (Elements: String)
     * @param prohibited
     *            <code>true</code> to extract "prohibited" terms, too
     * @throws IOException if something goes wrong
     */
    public static void getTerms(Query query, HashSet terms, boolean prohibited) throws IOException {

        if (query instanceof BooleanQuery) {
            getTermsFromBooleanQuery((BooleanQuery)query, terms, prohibited);
        } else if (query instanceof PhraseQuery) {
            getTermsFromPhraseQuery((PhraseQuery)query, terms);
        } else if (query instanceof TermQuery) {
            getTermsFromTermQuery((TermQuery)query, terms);
        }
    }

    /**
     * Extracts all term texts of a given BooleanQuery by descending into
     * each of its clauses. Term texts will be returned in lower-case.<p>
     *
     * @param query
     *            BooleanQuery to extract term texts from
     * @param terms
     *            HashSet where extracted term texts should be put into
     *            (Elements: String)
     * @param prohibited
     *            <code>true</code> to extract "prohibited" terms, too
     * @throws IOException if something goes wrong
     */
    private static void getTermsFromBooleanQuery(BooleanQuery query, HashSet terms, boolean prohibited)
    throws IOException {

        BooleanClause[] queryClauses = query.getClauses();
        for (int i = 0; i < queryClauses.length; i++) {
            // prohibited clauses are only followed if explicitly requested
            if (prohibited || !queryClauses[i].isProhibited()) {
                getTerms(queryClauses[i].getQuery(), terms, prohibited);
            }
        }
    }

    /**
     * Extracts all term texts of a given PhraseQuery. Term texts will be
     * returned in lower-case.<p>
     *
     * @param query
     *            PhraseQuery to extract term texts from
     * @param terms
     *            HashSet where extracted term texts should be put into
     *            (Elements: String)
     */
    private static void getTermsFromPhraseQuery(PhraseQuery query, HashSet terms) {

        Term[] queryTerms = query.getTerms();
        for (int i = 0; i < queryTerms.length; i++) {
            terms.add(getTermsFromTerm(queryTerms[i]));
        }
    }

    /**
     * Extracts the text of a given Term. The text will be returned in
     * lower-case.<p>
     *
     * @param term
     *            Term to extract the text from
     *
     * @return the Term's text in lower-case
     */
    private static String getTermsFromTerm(Term term) {

        return term.text().toLowerCase();
    }

    /**
     * Extracts the term text of a given TermQuery. The term text will be
     * returned in lower-case.<p>
     *
     * @param query
     *            TermQuery to extract the term text from
     * @param terms
     *            HashSet where the extracted term text should be put into
     *            (Elements: String)
     */
    private static void getTermsFromTermQuery(TermQuery query, HashSet terms) {

        terms.add(getTermsFromTerm(query.getTerm()));
    }

    /**
     * Highlights a text in accordance to the given query, extracting the most
     * relevant sections.<p>
     *
     * The text is tokenized with the analyzer and rebuilt token by token: every
     * token whose term text is contained in the query terms is passed through the
     * highlighter, text between two tokens is replaced by a single space, and a
     * non-matching token longer than half the fragment size is truncated to that
     * length. The rebuilt text is cut into fragments and each fragment is scored
     * by the number of distinct query terms found in it. The fragments with the
     * highest scores are returned, best fragment first; fragments with the same
     * score are returned in document order.<p>
     *
     * @param text
     *            text to highlight terms in, <code>null</code> yields an empty result
     * @param fragmentSize
     *            the size in characters of each fragment to be returned
     * @param maxNumFragments
     *            the maximum number of fragments, values of 0 or less yield an
     *            empty result
     *
     * @return highlighted text fragments (between 0 and maxNumFragments number
     *         of fragments)
     * @throws IOException if reading the token stream fails
     */
    public String[] getBestFragments(String text, int fragmentSize, int maxNumFragments) throws IOException {

        if ((text == null) || (maxNumFragments <= 0)) {
            // nothing can be returned; without this guard a limit of 0 would empty
            // the queue and then dereference its (null) top element, and a negative
            // limit would produce an invalid queue size
            return new String[0];
        }

        StringBuffer newText = new StringBuffer();
        TokenStream stream = null;
        ArrayList docFrags = new ArrayList();
        DocumentFragment currentFrag = new DocumentFragment(newText.length(), docFrags.size());
        docFrags.add(currentFrag);

        try {
            org.apache.lucene.analysis.Token token;
            int lastEndOffset = 0;
            stream = m_analyzer.tokenStream(null, new StringReader(text));
            while ((token = stream.next()) != null) {
                int startOffset = token.startOffset();
                int endOffset = token.endOffset();
                // the token as it is spelled in the source text; the term text
                // delivered by the analyzer is only used for matching
                String tokenText = text.substring(startOffset, endOffset);
                String termText = token.termText();

                // whatever was between the last token (or the beginning of the text)
                // and this token is replaced by a single space
                if (startOffset > lastEndOffset) {
                    newText.append(" ");
                }

                if (m_terms.contains(termText)) {
                    // the query contains the current token
                    newText.append(m_highlighter.highlightTerm(tokenText));
                    currentFrag.addTerm(termText);
                } else if (tokenText.length() > fragmentSize / 2) {
                    // very long tokens are cut so they can not fill a fragment alone
                    newText.append(tokenText.substring(0, fragmentSize / 2));
                    newText.append(" ");
                } else {
                    newText.append(tokenText);
                }

                // NOTE(review): the list already contains the current fragment, so the
                // first fragment is only closed at 2 * fragmentSize characters, all
                // later ones after roughly fragmentSize more - confirm this is intended
                if (newText.length() >= (fragmentSize * (docFrags.size() + 1))) {
                    // close the current fragment and start recording a new one
                    currentFrag.m_textEndPos = newText.length();
                    currentFrag = new DocumentFragment(newText.length(), docFrags.size());
                    docFrags.add(currentFrag);
                }
                lastEndOffset = endOffset;
            }

            // append text after end of last token
            if (lastEndOffset < text.length()) {
                newText.append(text.substring(lastEndOffset));
            }
            currentFrag.m_textEndPos = newText.length();

            // find the most relevant sections of the text; the queue never holds more
            // than one element above the limit and never more than all fragments, so
            // it is sized accordingly (this also avoids an overflow of "limit + 1")
            FragmentQueue fragQueue = new FragmentQueue(Math.min(maxNumFragments, docFrags.size()) + 1);
            int minScore = 0;
            for (Iterator i = docFrags.iterator(); i.hasNext();) {
                DocumentFragment candidate = (DocumentFragment)i.next();
                if (candidate.getScore() >= minScore) {
                    fragQueue.put(candidate);
                    if (fragQueue.size() > maxNumFragments) {
                        // queue is overfull: remove the lowest fragment and raise the
                        // minimum score to the score of the new lowest fragment
                        fragQueue.pop();
                        minScore = ((DocumentFragment)fragQueue.top()).getScore();
                    }
                }
            }

            // extract the text; the queue delivers the lowest fragment first, so the
            // result array is filled from its end
            String[] fragText = new String[fragQueue.size()];
            for (int i = fragText.length - 1; i >= 0; i--) {
                DocumentFragment frag = (DocumentFragment)fragQueue.pop();
                fragText[i] = newText.substring(frag.m_textStartPos, frag.m_textEndPos);
            }
            return fragText;
        } finally {
            if (stream != null) {
                try {
                    stream.close();
                } catch (Exception e) {
                    // closing is best effort only, a failure here must not hide
                    // the result or an exception thrown above
                }
            }
        }
    }

    /**
     * Highlights a text in accordance to the given query and extracts the most
     * relevant sections, joined to a single String.<p>
     *
     * The fragments are calculated by
     * {@link #getBestFragments(String, int, int)} and concatenated in the
     * returned order with the separator placed between two fragments.<p>
     *
     * @param text
     *            text to highlight terms in
     * @param fragmentSize
     *            the size in characters of each fragment to be returned
     * @param maxNumFragments
     *            the maximum number of fragments.
     * @param separator
     *            the separator used to intersperse the document fragments
     *            (typically " ... ")
     *
     * @return highlighted text, an empty String if there are no fragments
     * @throws IOException if reading the token stream fails
     */
    public String getBestFragments(String text, int fragmentSize, int maxNumFragments, String separator)
    throws IOException {

        String[] sections = getBestFragments(text, fragmentSize, maxNumFragments);
        StringBuffer result = new StringBuffer();
        for (int i = 0; i < sections.length; i++) {
            if (i > 0) {
                result.append(separator);
            }
            result.append(sections[i]);
        }
        return result.toString();
    }
}
/**
 * Describes one fragment of a document text: its position in the text, its
 * running number and the distinct terms that have been recorded for it.<p>
 *
 * @author Alexander Kandzior
 *
 * @version $Revision: 1.8 $
 *
 * @since 6.0.0
 */
class DocumentFragment {

    /** The running number of this fragment. */
    protected int m_fragNum;

    /** The score (not written or read by this class, see {@link #getScore()}). */
    protected int m_score;

    /** The position in the text where this fragment ends. */
    protected int m_textEndPos;

    /** The position in the text where this fragment starts. */
    protected int m_textStartPos;

    /** The distinct terms recorded for this fragment. */
    protected HashSet m_uniqueTerms;

    /**
     * Creates a new fragment without any recorded terms.<p>
     *
     * @param textStartPos the position in the text where the fragment starts
     * @param fragNum the running number of the fragment
     */
    public DocumentFragment(int textStartPos, int fragNum) {

        m_uniqueTerms = new HashSet();
        m_fragNum = fragNum;
        m_textStartPos = textStartPos;
    }

    /**
     * Records a term for this fragment, a term that was already recorded
     * is not counted again.<p>
     *
     * @param term the term to record
     */
    void addTerm(String term) {

        m_uniqueTerms.add(term);
    }

    /**
     * Returns the score of this fragment, which is the number of distinct
     * terms recorded so far.<p>
     *
     * @return the number of distinct recorded terms
     */
    int getScore() {

        int distinctTerms = m_uniqueTerms.size();
        return distinctTerms;
    }
}
/**
 * A priority queue for document fragments that orders them by score,
 * using the fragment number to order fragments with the same score.<p>
 */
class FragmentQueue extends PriorityQueue {

    /**
     * Creates a new queue initialized with the given size.<p>
     *
     * @param size the size passed on to the initialization of the queue
     */
    public FragmentQueue(int size) {

        initialize(size);
    }

    /**
     * Compares two document fragments: a fragment is "less" if its score is lower,
     * or, for the same score, if its fragment number is higher.<p>
     *
     * @see org.apache.lucene.util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object)
     */
    public final boolean lessThan(Object a, Object b) {

        DocumentFragment first = (DocumentFragment)a;
        DocumentFragment second = (DocumentFragment)b;
        int firstScore = first.getScore();
        int secondScore = second.getScore();
        if (firstScore != secondScore) {
            return firstScore < secondScore;
        }
        // same score: the fragment with the higher number is the lesser one
        return first.m_fragNum > second.m_fragNum;
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -