// MoreLikeThis.java (fragment) — Lucene "more like this" query generation.
// NOTE(review): this is the tail of a scoring loop whose head is above this
// fragment — presumably createQueue(Map), iterating each candidate word with
// its source-document frequency `tf`; confirm against the full file.
// Find the index-wide document frequency for this word, tracking the field
// in which it is most frequent ("top field").
int freq = ir.docFreq(new Term(fieldNames[i], word));
topField = (freq > docFreq) ? fieldNames[i] : topField;
docFreq = (freq > docFreq) ? freq : docFreq;
}
if (minDocFreq > 0 && docFreq < minDocFreq) {
continue; // filter out words that don't occur in enough docs
}
if (docFreq == 0) {
continue; // index update problem?
}
// Score = tf * idf, using the configured Similarity's idf formula.
float idf = similarity.idf(docFreq, numDocs);
float score = tf * idf;
// only really need 1st 3 entries, other ones are for troubleshooting
res.insert(new Object[]{word, // the word
topField, // the top field
new Float(score), // overall score
new Float(idf), // idf
new Integer(docFreq), // freq in all docs
new Integer(tf)
});
}
return res;
}
/**
 * Describe the parameters that control how the "more like this" query is formed.
 *
 * @return a tab-indented, human-readable summary of the current settings
 */
public String describeParams() {
    StringBuffer sb = new StringBuffer();
    sb.append("\t" + "maxQueryTerms : " + maxQueryTerms + "\n");
    sb.append("\t" + "minWordLen : " + minWordLen + "\n");
    sb.append("\t" + "maxWordLen : " + maxWordLen + "\n");
    sb.append("\t" + "fieldNames : \"");
    String delim = "";
    for (int i = 0; i < fieldNames.length; i++) {
        sb.append(delim).append(fieldNames[i]);
        delim = ", ";
    }
    // BUGFIX: close the quote opened after "fieldNames : " — previously only
    // a newline was appended, leaving the quote unbalanced in the output.
    sb.append("\"\n");
    sb.append("\t" + "boost : " + boost + "\n");
    sb.append("\t" + "minTermFreq : " + minTermFreq + "\n");
    sb.append("\t" + "minDocFreq : " + minDocFreq + "\n");
    return sb.toString();
}
/**
 * Test driver.
 * Pass in "-i INDEX" and then either "-fn FILE" or "-url URL".
 * Prints the generated query parameters, the query itself, and up to 25 hits.
 */
public static void main(String[] a) throws Throwable {
    String indexName = "localhost_index";
    String fn = "c:/Program Files/Apache Group/Apache/htdocs/manual/vhosts/index.html.en";
    URL url = null;
    for (int i = 0; i < a.length; i++) {
        if (a[i].equals("-i")) {
            indexName = a[++i];
        }
        else if (a[i].equals("-f")) {
            fn = a[++i];
        }
        else if (a[i].equals("-url")) {
            url = new URL(a[++i]);
        }
    }
    PrintStream o = System.out;
    IndexReader r = IndexReader.open(indexName);
    try {
        o.println("Open index " + indexName + " which has " + r.numDocs() + " docs");
        MoreLikeThis mlt = new MoreLikeThis(r);
        o.println("Query generation parameters:");
        o.println(mlt.describeParams());
        o.println();
        Query query = null;
        if (url != null) {
            o.println("Parsing URL: " + url);
            query = mlt.like(url);
        }
        else if (fn != null) {
            o.println("Parsing file: " + fn);
            query = mlt.like(new File(fn));
        }
        o.println("q: " + query);
        o.println();
        IndexSearcher searcher = new IndexSearcher(indexName);
        try {
            Hits hits = searcher.search(query);
            int len = hits.length();
            o.println("found: " + len + " documents matching");
            o.println();
            for (int i = 0; i < Math.min(25, len); i++) {
                Document d = hits.doc(i);
                String summary = d.get("summary");
                o.println("score : " + hits.score(i));
                o.println("url : " + d.get("url"));
                o.println("\ttitle : " + d.get("title"));
                if (summary != null)
                    o.println("\tsummary: " + summary);
                o.println();
            }
        } finally {
            // BUGFIX: close the searcher (was leaked previously).
            searcher.close();
        }
    } finally {
        // BUGFIX: close the reader (was leaked previously).
        r.close();
    }
}
/**
 * Find words for a more-like-this query former.
 * Uses the stored term vector for each configured field when available;
 * otherwise falls back to re-analyzing the stored field text.
 *
 * @param docNum the id of the lucene document from which to find terms
 * @return a priority queue of scored term entries (see createQueue)
 */
private PriorityQueue retrieveTerms(int docNum) throws IOException {
    Map frequencies = new HashMap();
    for (int idx = 0; idx < fieldNames.length; idx++) {
        String field = fieldNames[idx];
        TermFreqVector tfv = ir.getTermFreqVector(docNum, field);
        if (tfv != null) {
            // Term vector info was stored for this field: count directly from it.
            addTermFrequencies(frequencies, tfv);
        }
        else {
            // No term vector: re-tokenize each stored value of the field.
            Document doc = ir.document(docNum);
            String[] values = doc.getValues(field);
            if (values != null) {
                for (int v = 0; v < values.length; v++) {
                    addTermFrequencies(new StringReader(values[v]), frequencies, field);
                }
            }
        }
    }
    return createQueue(frequencies);
}
/**
 * Adds terms and frequencies found in vector into the Map termFreqMap,
 * skipping terms rejected by {@link #isNoiseWord(String)}.
 *
 * @param termFreqMap a Map of terms and their frequencies
 * @param vector List of terms and their frequencies for a doc/field
 */
private void addTermFrequencies(Map termFreqMap, TermFreqVector vector) {
    String[] terms = vector.getTerms();
    int[] freqs = vector.getTermFrequencies();
    for (int i = 0; i < terms.length; i++) {
        String term = terms[i];
        if (isNoiseWord(term)) {
            continue;
        }
        // Accumulate the vector's count into the running tally for this term.
        Int tally = (Int) termFreqMap.get(term);
        if (tally != null) {
            tally.x += freqs[i];
        }
        else {
            Int fresh = new Int();
            fresh.x = freqs[i];
            termFreqMap.put(term, fresh);
        }
    }
}
/**
 * Adds term frequencies found by tokenizing text from reader into the Map words.
 * Stops after maxNumTokensParsed tokens have been read; noise words are skipped.
 *
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldName Used by analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map termFreqMap, String fieldName)
    throws IOException
{
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    try {
        org.apache.lucene.analysis.Token token;
        int tokenCount = 0;
        while ((token = ts.next()) != null) { // for every token
            String word = token.termText();
            // Note: the token that takes the count past the cap is read but
            // discarded, matching the original behavior.
            tokenCount++;
            if (tokenCount > maxNumTokensParsed) {
                break;
            }
            if (isNoiseWord(word)) {
                continue;
            }
            // increment frequency (new Int() starts at 1 for a first sighting)
            Int cnt = (Int) termFreqMap.get(word);
            if (cnt == null) {
                termFreqMap.put(word, new Int());
            }
            else {
                cnt.x++;
            }
        }
    } finally {
        // BUGFIX: close the token stream so analyzer resources are released
        // (was previously leaked on every call).
        ts.close();
    }
}
/** determines if the passed term is likely to be of interest in "more like" comparisons
 *
 * @param term The word being considered
 * @return true if should be ignored, false if should be used in further analysis
 */
private boolean isNoiseWord(String term)
{
    int len = term.length();
    // A word is noise when it is too short, too long, or a configured stop word.
    boolean tooShort = minWordLen > 0 && len < minWordLen;
    boolean tooLong = maxWordLen > 0 && len > maxWordLen;
    boolean stopped = stopWords != null && stopWords.contains(term);
    return tooShort || tooLong || stopped;
}
/**
 * Find words for a more-like-this query former.
 * The result is a priority queue of arrays with one entry for <b>every word</b> in the document.
 * Each array has 6 elements.
 * The elements are:
 * <ol>
 * <li> The word (String)
 * <li> The top field that this word comes from (String)
 * <li> The score for this word (Float)
 * <li> The IDF value (Float)
 * <li> The frequency of this word in the index (Integer)
 * <li> The frequency of this word in the source document (Integer)
 * </ol>
 * This is a somewhat "advanced" routine, and in general only the 1st entry in the array is of interest.
 * This method is exposed so that you can identify the "interesting words" in a document.
 * For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}.
 *
 * @param r the reader that has the content of the document
 * @return the most intresting words in the document ordered by score, with the highest scoring, or best entry, first
 *
 * @see #retrieveInterestingTerms
 */
public PriorityQueue retrieveTerms(Reader r) throws IOException {
    Map words = new HashMap();
    // NOTE(review): the same Reader is passed for every field; after the first
    // pass it is typically exhausted, so later fields may see no tokens —
    // preserved as-is to keep behavior identical, but worth confirming upstream.
    for (int field = 0; field < fieldNames.length; field++) {
        addTermFrequencies(r, words, fieldNames[field]);
    }
    return createQueue(words);
}
/**
 * Convenience routine to make it easy to return the most interesting words in a document.
 * More advanced users will call {@link #retrieveTerms(java.io.Reader) retrieveTerms()} directly.
 *
 * @param r the source document
 * @return the most interesting words in the document, best first, capped at maxQueryTerms
 *
 * @see #retrieveTerms(java.io.Reader)
 * @see #setMaxQueryTerms
 */
public String[] retrieveInterestingTerms(Reader r) throws IOException {
    ArrayList interesting = new ArrayList(maxQueryTerms);
    PriorityQueue pq = retrieveTerms(r);
    // retrieveTerms returns every word in the document; only the top
    // maxQueryTerms entries are useful to the caller.
    int remaining = maxQueryTerms;
    Object popped;
    while ((popped = pq.pop()) != null && remaining-- > 0) {
        // the 1st entry of each array is the interesting word itself
        interesting.add(((Object[]) popped)[0]);
    }
    return (String[]) interesting.toArray(new String[interesting.size()]);
}
/**
 * PriorityQueue that orders words by score.
 * Entries are the Object[] arrays built in createQueue; index 2 holds the score.
 */
private static class FreqQ extends PriorityQueue {
    FreqQ(int capacity) {
        initialize(capacity);
    }
    // Higher score counts as "less than" so the highest-scoring entry
    // sits at the top of the queue and is popped first.
    protected boolean lessThan(Object a, Object b) {
        float scoreA = ((Float) ((Object[]) a)[2]).floatValue();
        float scoreB = ((Float) ((Object[]) b)[2]).floatValue();
        return scoreA > scoreB;
    }
}
/**
 * Use for frequencies and to avoid renewing Integers.
 * A mutable int box: incremented in place instead of re-boxing on each count.
 */
private static class Int {
    // starts at 1 — a term is counted once when first seen
    int x = 1;
    Int() {
    }
}
}