📄 morelikethis.java
字号:
public void setAnalyzer(Analyzer analyzer) {
this.analyzer = analyzer;
}
/**
* Returns the frequency below which terms will be ignored in the source doc. The default
* frequency is the {@link #DEFAULT_MIN_TERM_FREQ}.
*
* @return the frequency below which terms will be ignored in the source doc.
*/
public int getMinTermFreq() {
return minTermFreq;
}
/**
* Sets the frequency below which terms will be ignored in the source doc.
*
* @param minTermFreq the frequency below which terms will be ignored in the source doc.
*/
public void setMinTermFreq(int minTermFreq) {
this.minTermFreq = minTermFreq;
}
/**
* Returns the frequency at which words will be ignored which do not occur in at least this
* many docs. The default frequency is {@link #DEFAULT_MIN_DOC_FREQ}.
*
* @return the frequency at which words will be ignored which do not occur in at least this
* many docs.
*/
public int getMinDocFreq() {
return minDocFreq;
}
/**
* Sets the frequency at which words will be ignored which do not occur in at least this
* many docs.
*
* @param minDocFreq the frequency at which words will be ignored which do not occur in at
* least this many docs.
*/
public void setMinDocFreq(int minDocFreq) {
this.minDocFreq = minDocFreq;
}
/**
* Returns whether to boost terms in query based on "score" or not. The default is
* {@link #DEFAULT_BOOST}.
*
* @return whether to boost terms in query based on "score" or not.
* @see #setBoost
*/
public boolean isBoost() {
return boost;
}
/**
* Sets whether to boost terms in query based on "score" or not.
*
* @param boost true to boost terms in query based on "score", false otherwise.
* @see #isBoost
*/
public void setBoost(boolean boost) {
this.boost = boost;
}
/**
* Returns the field names that will be used when generating the 'More Like This' query.
* The default field names that will be used is {@link #DEFAULT_FIELD_NAMES}.
*
* @return the field names that will be used when generating the 'More Like This' query.
*/
public String[] getFieldNames() {
return fieldNames;
}
/**
* Sets the field names that will be used when generating the 'More Like This' query.
* Set this to null for the field names to be determined at runtime from the IndexReader
* provided in the constructor.
*
* @param fieldNames the field names that will be used when generating the 'More Like This'
* query.
*/
public void setFieldNames(String[] fieldNames) {
this.fieldNames = fieldNames;
}
/**
* Returns the minimum word length below which words will be ignored. Set this to 0 for no
* minimum word length. The default is {@link #DEFAULT_MIN_WORD_LENGTH}.
*
* @return the minimum word length below which words will be ignored.
*/
public int getMinWordLen() {
return minWordLen;
}
/**
* Sets the minimum word length below which words will be ignored.
*
* @param minWordLen the minimum word length below which words will be ignored.
*/
public void setMinWordLen(int minWordLen) {
this.minWordLen = minWordLen;
}
/**
* Returns the maximum word length above which words will be ignored. Set this to 0 for no
* maximum word length. The default is {@link #DEFAULT_MAX_WORD_LENGTH}.
*
* @return the maximum word length above which words will be ignored.
*/
public int getMaxWordLen() {
return maxWordLen;
}
/**
* Sets the maximum word length above which words will be ignored.
*
* @param maxWordLen the maximum word length above which words will be ignored.
*/
public void setMaxWordLen(int maxWordLen) {
this.maxWordLen = maxWordLen;
}
/**
* Set the set of stopwords.
* Any word in this set is considered "uninteresting" and ignored.
* Even if your Analyzer allows stopwords, you might want to tell the MoreLikeThis code to ignore them, as
* for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting".
*
* @param stopWords set of stopwords, if null it means to allow stop words
*
* @see org.apache.lucene.analysis.StopFilter#makeStopSet StopFilter.makeStopSet()
* @see #getStopWords
*/
public void setStopWords(Set stopWords) {
this.stopWords = stopWords;
}
/**
* Get the current stop words being used.
* @see #setStopWords
*/
public Set getStopWords() {
return stopWords;
}
/**
* Returns the maximum number of query terms that will be included in any generated query.
* The default is {@link #DEFAULT_MAX_QUERY_TERMS}.
*
* @return the maximum number of query terms that will be included in any generated query.
*/
public int getMaxQueryTerms() {
return maxQueryTerms;
}
/**
* Sets the maximum number of query terms that will be included in any generated query.
*
* @param maxQueryTerms the maximum number of query terms that will be included in any
* generated query.
*/
public void setMaxQueryTerms(int maxQueryTerms) {
this.maxQueryTerms = maxQueryTerms;
}
/**
* @return The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
* @see #DEFAULT_MAX_NUM_TOKENS_PARSED
*/
public int getMaxNumTokensParsed()
{
return maxNumTokensParsed;
}
/**
* @param i The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
*/
public void setMaxNumTokensParsed(int i)
{
maxNumTokensParsed = i;
}
/**
* Return a query that will return docs like the passed lucene document ID.
*
* @param docNum the documentID of the lucene doc to generate the 'More Like This" query for.
* @return a query that will return docs like the passed lucene document ID.
*/
public Query like(int docNum) throws IOException {
if (fieldNames == null) {
// gather list of valid fields from lucene
Collection fields = ir.getFieldNames( IndexReader.FieldOption.INDEXED);
fieldNames = (String[]) fields.toArray(new String[fields.size()]);
}
return createQuery(retrieveTerms(docNum));
}
/**
* Return a query that will return docs like the passed file.
*
* @return a query that will return docs like the passed file.
*/
public Query like(File f) throws IOException {
if (fieldNames == null) {
// gather list of valid fields from lucene
Collection fields = ir.getFieldNames( IndexReader.FieldOption.INDEXED);
fieldNames = (String[]) fields.toArray(new String[fields.size()]);
}
return like(new FileReader(f));
}
/**
* Return a query that will return docs like the passed URL.
*
* @return a query that will return docs like the passed URL.
*/
public Query like(URL u) throws IOException {
return like(new InputStreamReader(u.openConnection().getInputStream()));
}
/**
* Return a query that will return docs like the passed stream.
*
* @return a query that will return docs like the passed stream.
*/
public Query like(java.io.InputStream is) throws IOException {
return like(new InputStreamReader(is));
}
/**
* Return a query that will return docs like the passed Reader.
*
* @return a query that will return docs like the passed Reader.
*/
public Query like(Reader r) throws IOException {
return createQuery(retrieveTerms(r));
}
/**
* Create the More like query from a PriorityQueue
*/
private Query createQuery(PriorityQueue q) {
BooleanQuery query = new BooleanQuery();
Object cur;
int qterms = 0;
float bestScore = 0;
while (((cur = q.pop()) != null)) {
Object[] ar = (Object[]) cur;
TermQuery tq = new TermQuery(new Term((String) ar[1], (String) ar[0]));
if (boost) {
if (qterms == 0) {
bestScore = ((Float) ar[2]).floatValue();
}
float myScore = ((Float) ar[2]).floatValue();
tq.setBoost(myScore / bestScore);
}
try {
query.add(tq, BooleanClause.Occur.SHOULD);
}
catch (BooleanQuery.TooManyClauses ignore) {
break;
}
qterms++;
if (maxQueryTerms > 0 && qterms >= maxQueryTerms) {
break;
}
}
return query;
}
/**
* Create a PriorityQueue from a word->tf map.
*
* @param words a map of words keyed on the word(String) with Int objects as the values.
*/
private PriorityQueue createQueue(Map words) throws IOException {
// have collected all words in doc and their freqs
int numDocs = ir.numDocs();
FreqQ res = new FreqQ(words.size()); // will order words by score
Iterator it = words.keySet().iterator();
while (it.hasNext()) { // for every word
String word = (String) it.next();
int tf = ((Int) words.get(word)).x; // term freq in the source doc
if (minTermFreq > 0 && tf < minTermFreq) {
continue; // filter out words that don't occur enough times in the source
}
// go through all the fields and find the largest document frequency
String topField = fieldNames[0];
int docFreq = 0;
for (int i = 0; i < fieldNames.length; i++) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -