📄 morelikethis.java

📁 lucene2.2.0版本
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
    public void setAnalyzer(Analyzer analyzer) {
        this.analyzer = analyzer;
    }

    /**
     * Returns the frequency below which terms will be ignored in the source doc. The default
     * frequency is the {@link #DEFAULT_MIN_TERM_FREQ}.
     *
     * @return the frequency below which terms will be ignored in the source doc.
     */
    public int getMinTermFreq() {
        return minTermFreq;
    }

    /**
     * Sets the frequency below which terms will be ignored in the source doc.
     *
     * @param minTermFreq the frequency below which terms will be ignored in the source doc.
     */
    public void setMinTermFreq(int minTermFreq) {
        this.minTermFreq = minTermFreq;
    }

    /**
     * Returns the frequency at which words will be ignored which do not occur in at least this
     * many docs. The default frequency is {@link #DEFAULT_MIN_DOC_FREQ}.
     *
     * @return the frequency at which words will be ignored which do not occur in at least this
     * many docs.
     */
    public int getMinDocFreq() {
        return minDocFreq;
    }

    /**
     * Sets the frequency at which words will be ignored which do not occur in at least this
     * many docs.
     *
     * @param minDocFreq the frequency at which words will be ignored which do not occur in at
     * least this many docs.
     */
    public void setMinDocFreq(int minDocFreq) {
        this.minDocFreq = minDocFreq;
    }

    /**
     * Returns whether to boost terms in query based on "score" or not. The default is
     * {@link #DEFAULT_BOOST}.
     *
     * @return whether to boost terms in query based on "score" or not.
	 * @see #setBoost
     */
    public boolean isBoost() {
        return boost;
    }

    /**
     * Sets whether to boost terms in query based on "score" or not.
     *
     * @param boost true to boost terms in query based on "score", false otherwise.
	 * @see #isBoost
     */
    public void setBoost(boolean boost) {
        this.boost = boost;
    }

    /**
     * Returns the field names that will be used when generating the 'More Like This' query.
     * The default field names that will be used is {@link #DEFAULT_FIELD_NAMES}.
     *
     * @return the field names that will be used when generating the 'More Like This' query.
     */
    public String[] getFieldNames() {
        return fieldNames;
    }

    /**
     * Sets the field names that will be used when generating the 'More Like This' query.
     * Set this to null for the field names to be determined at runtime from the IndexReader
     * provided in the constructor.
     *
     * @param fieldNames the field names that will be used when generating the 'More Like This'
     * query.
     */
    public void setFieldNames(String[] fieldNames) {
        this.fieldNames = fieldNames;
    }

    /**
     * Returns the minimum word length below which words will be ignored. Set this to 0 for no
     * minimum word length. The default is {@link #DEFAULT_MIN_WORD_LENGTH}.
     *
     * @return the minimum word length below which words will be ignored.
     */
    public int getMinWordLen() {
        return minWordLen;
    }

    /**
     * Sets the minimum word length below which words will be ignored.
     *
     * @param minWordLen the minimum word length below which words will be ignored.
     */
    public void setMinWordLen(int minWordLen) {
        this.minWordLen = minWordLen;
    }

    /**
     * Returns the maximum word length above which words will be ignored. Set this to 0 for no
     * maximum word length. The default is {@link #DEFAULT_MAX_WORD_LENGTH}.
     *
     * @return the maximum word length above which words will be ignored.
     */
    public int getMaxWordLen() {
        return maxWordLen;
    }

    /**
     * Sets the maximum word length above which words will be ignored.
     *
     * @param maxWordLen the maximum word length above which words will be ignored.
     */
    public void setMaxWordLen(int maxWordLen) {
        this.maxWordLen = maxWordLen;
    }

	/**
	 * Set the set of stopwords.
	 * Any word in this set is considered "uninteresting" and ignored.
	 * Even if your Analyzer allows stopwords, you might want to tell the MoreLikeThis code to ignore them, as
	 * for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting".
	 * 
	 * @param stopWords set of stopwords, if null it means to allow stop words
	 *
	 * @see org.apache.lucene.analysis.StopFilter#makeStopSet StopFilter.makeStopSet()
	 * @see #getStopWords	 
	 */
	public void setStopWords(Set stopWords) {
		this.stopWords = stopWords;
	}

	/**
	 * Get the current stop words being used.
	 * @see #setStopWords
	 */
	public Set getStopWords() {
		return stopWords;
	}
		

    /**
     * Returns the maximum number of query terms that will be included in any generated query.
     * The default is {@link #DEFAULT_MAX_QUERY_TERMS}.
     *
     * @return the maximum number of query terms that will be included in any generated query.
     */
    public int getMaxQueryTerms() {
        return maxQueryTerms;
    }

    /**
     * Sets the maximum number of query terms that will be included in any generated query.
     *
     * @param maxQueryTerms the maximum number of query terms that will be included in any
     * generated query.
     */
    public void setMaxQueryTerms(int maxQueryTerms) {
        this.maxQueryTerms = maxQueryTerms;
    }

	/**
	 * @return The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
	 * @see #DEFAULT_MAX_NUM_TOKENS_PARSED
	 */
	public int getMaxNumTokensParsed()
	{
		return maxNumTokensParsed;
	}

	/**
	 * @param i The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
	 */
	public void setMaxNumTokensParsed(int i)
	{
		maxNumTokensParsed = i;
	}




    /**
     * Return a query that will return docs like the passed lucene document ID.
     *
     * @param docNum the documentID of the lucene doc to generate the 'More Like This" query for.
     * @return a query that will return docs like the passed lucene document ID.
     */
    public Query like(int docNum) throws IOException {
        if (fieldNames == null) {
            // gather list of valid fields from lucene
            Collection fields = ir.getFieldNames( IndexReader.FieldOption.INDEXED);
            fieldNames = (String[]) fields.toArray(new String[fields.size()]);
        }

        return createQuery(retrieveTerms(docNum));
    }

    /**
     * Return a query that will return docs like the passed file.
     *
     * @return a query that will return docs like the passed file.
     */
    public Query like(File f) throws IOException {
        if (fieldNames == null) {
            // gather list of valid fields from lucene
            Collection fields = ir.getFieldNames( IndexReader.FieldOption.INDEXED);
            fieldNames = (String[]) fields.toArray(new String[fields.size()]);
        }

        return like(new FileReader(f));
    }

    /**
     * Return a query that will return docs like the passed URL.
     *
     * @return a query that will return docs like the passed URL.
     */
    public Query like(URL u) throws IOException {
        return like(new InputStreamReader(u.openConnection().getInputStream()));
    }

    /**
     * Return a query that will return docs like the passed stream.
     *
     * @return a query that will return docs like the passed stream.
     */
    public Query like(java.io.InputStream is) throws IOException {
        return like(new InputStreamReader(is));
    }

    /**
     * Return a query that will return docs like the passed Reader.
     *
     * @return a query that will return docs like the passed Reader.
     */
    public Query like(Reader r) throws IOException {
        return createQuery(retrieveTerms(r));
    }

    /**
     * Create the More like query from a PriorityQueue
     */
    private Query createQuery(PriorityQueue q) {
        BooleanQuery query = new BooleanQuery();
        Object cur;
        int qterms = 0;
        float bestScore = 0;

        while (((cur = q.pop()) != null)) {
            Object[] ar = (Object[]) cur;
            TermQuery tq = new TermQuery(new Term((String) ar[1], (String) ar[0]));

            if (boost) {
                if (qterms == 0) {
                    bestScore = ((Float) ar[2]).floatValue();
                }
                float myScore = ((Float) ar[2]).floatValue();

                tq.setBoost(myScore / bestScore);
            }

            try {
                query.add(tq, BooleanClause.Occur.SHOULD);
            }
            catch (BooleanQuery.TooManyClauses ignore) {
                break;
            }

            qterms++;
            if (maxQueryTerms > 0 && qterms >= maxQueryTerms) {
                break;
            }
        }

        return query;
    }

    /**
     * Create a PriorityQueue from a word->tf map.
     *
     * @param words a map of words keyed on the word(String) with Int objects as the values.
     */
    private PriorityQueue createQueue(Map words) throws IOException {
        // have collected all words in doc and their freqs
        int numDocs = ir.numDocs();
        FreqQ res = new FreqQ(words.size()); // will order words by score

        Iterator it = words.keySet().iterator();
        while (it.hasNext()) { // for every word
            String word = (String) it.next();

            int tf = ((Int) words.get(word)).x; // term freq in the source doc
            if (minTermFreq > 0 && tf < minTermFreq) {
                continue; // filter out words that don't occur enough times in the source
            }

            // go through all the fields and find the largest document frequency
            String topField = fieldNames[0];
            int docFreq = 0;
            for (int i = 0; i < fieldNames.length; i++) {
💿 文件大小 5913 K
👤 上传用户 jjjjjkkkkjkjkjk
📂 所属分类 Java编程
🏷️ 相关标签

#lucene #版本
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -