📄 similarity.java

📁 中文分词,中科院分词的改装版。使用java调用dll来实现的。
💻 JAVA
字号:
package org.apache.lucene.search;

/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Serializable;
import java.util.Collection;
import java.util.Iterator;

import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexReader;       // for javadoc
import org.apache.lucene.index.IndexWriter;       // for javadoc
import org.apache.lucene.document.Field;          // for javadoc
import org.apache.lucene.util.SmallFloat;

/** Expert: Scoring API.
 * <p>Subclasses implement search scoring.
 *
 * <p>The score of query <code>q</code> for document <code>d</code> is defined
 * in terms of these methods as follows:
 *
 * <table cellpadding="0" cellspacing="0" border="0">
 *  <tr>
 *    <td valign="middle" align="right" rowspan="2">score(q,d) =<br></td>
 *    <td valign="middle" align="center">
 *    <big><big><big><big><big>&Sigma;</big></big></big></big></big></td>
 *    <td valign="middle"><small>
 *    ( {@link #tf(int) tf}(t in d) *
 *    {@link #idf(Term,Searcher) idf}(t)^2 *
 *    {@link Query#getBoost getBoost}(t in q) *
 *    {@link Field#getBoost getBoost}(t.field in d) *
 *    {@link #lengthNorm(String,int) lengthNorm}(t.field in d) )
 *    </small></td>
 *    <td valign="middle" rowspan="2">&nbsp;*
 *    {@link #coord(int,int) coord}(q,d) *
 *    {@link #queryNorm(float) queryNorm}(sumOfSquaredWeights)
 *    </td>
 *  </tr>
 *  <tr>
 *   <td valign="top" align="right">
 *    <small>t in q</small>
 *    </td>
 *  </tr>
 * </table>
 *
 * <p> where
 *
 * <table cellpadding="0" cellspacing="0" border="0">
 *  <tr>
 *    <td valign="middle" align="right" rowspan="2">sumOfSquaredWeights =<br></td>
 *    <td valign="middle" align="center">
 *    <big><big><big><big><big>&Sigma;</big></big></big></big></big></td>
 *    <td valign="middle"><small>
 *    ( {@link #idf(Term,Searcher) idf}(t) *
 *    {@link Query#getBoost getBoost}(t in q) )^2
 *    </small></td>
 *  </tr>
 *  <tr>
 *   <td valign="top" align="right">
 *    <small>t in q</small>
 *    </td>
 *  </tr>
 * </table>
 *
 * <p> Note that the above formula is motivated by the cosine-distance or dot-product
 * between document and query vector, which is implemented by {@link DefaultSimilarity}.
 *
 * @see #setDefault(Similarity)
 * @see IndexWriter#setSimilarity(Similarity)
 * @see Searcher#setSimilarity(Similarity)
 */
public abstract class Similarity implements Serializable {

  /** The Similarity implementation used by default. */
  private static Similarity defaultImpl = new DefaultSimilarity();

  /** Set the default Similarity implementation used by indexing and search
   * code.
   *
   * @param similarity the implementation that {@link #getDefault()} will
   *        return from now on
   *
   * @see Searcher#setSimilarity(Similarity)
   * @see IndexWriter#setSimilarity(Similarity)
   */
  public static void setDefault(Similarity similarity) {
    Similarity.defaultImpl = similarity;
  }

  /** Return the default Similarity implementation used by indexing and search
   * code.
   *
   * <p>This is initially an instance of {@link DefaultSimilarity}.
   *
   * @return the implementation most recently passed to
   *         {@link #setDefault(Similarity)}, or the initial instance
   *
   * @see Searcher#setSimilarity(Similarity)
   * @see IndexWriter#setSimilarity(Similarity)
   */
  public static Similarity getDefault() {
    return Similarity.defaultImpl;
  }

  /** Cache of decoded bytes. */
  private static final float[] NORM_TABLE = new float[256];

  static {
    // Pre-decode every possible byte value once so decodeNorm is a table lookup.
    for (int i = 0; i < 256; i++) {
      NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
    }
  }

  /** Decodes a normalization factor stored in an index.
   *
   * @param b the encoded normalization byte
   * @return the decoded normalization factor
   *
   * @see #encodeNorm(float)
   */
  public static float decodeNorm(byte b) {
    return NORM_TABLE[b & 0xFF];  // & 0xFF maps negative bytes to positive above 127
  }

  /** Returns a table for decoding normalization bytes.
   *
   * <p>The returned array is the shared cache itself, not a copy, so writes
   * to it would affect {@link #decodeNorm(byte)}.
   *
   * @return the 256-entry decoding table, indexed by unsigned byte value
   *
   * @see #encodeNorm(float)
   */
  public static float[] getNormDecoder() {
    return NORM_TABLE;
  }

  /** Computes the normalization value for a field given the total number of
   * terms contained in a field.  These values, together with field boosts, are
   * stored in an index and multiplied into scores for hits on each field by the
   * search code.
   *
   * <p>Matches in longer fields are less precise, so implementations of this
   * method usually return smaller values when <code>numTokens</code> is large,
   * and larger values when <code>numTokens</code> is small.
   *
   * <p>Note that these values are computed under {@link
   * IndexWriter#addDocument(org.apache.lucene.document.Document)} and stored then using
   * {@link #encodeNorm(float)}.  Thus they have limited precision, and documents
   * must be re-indexed if this method is altered.
   *
   * @param fieldName the name of the field
   * @param numTokens the total number of tokens contained in fields named
   * <i>fieldName</i> of <i>doc</i>.
   * @return a normalization factor for hits on this field of this document
   *
   * @see Field#setBoost(float)
   */
  public abstract float lengthNorm(String fieldName, int numTokens);

  /** Computes the normalization value for a query given the sum of the squared
   * weights of each of the query terms.  This value is then multiplied into the
   * weight of each query term.
   *
   * <p>This does not affect ranking, but rather just attempts to make scores
   * from different queries comparable.
   *
   * @param sumOfSquaredWeights the sum of the squares of query term weights
   * @return a normalization factor for query weights
   */
  public abstract float queryNorm(float sumOfSquaredWeights);

  /** Encodes a normalization factor for storage in an index.
   *
   * <p>The encoding uses a three-bit mantissa, a five-bit exponent, and
   * the zero-exponent point at 15, thus
   * representing values from around 7x10^9 to 2x10^-9 with about one
   * significant decimal digit of accuracy.  Zero is also represented.
   * Negative numbers are rounded up to zero.  Values too large to represent
   * are rounded down to the largest representable value.  Positive values too
   * small to represent are rounded up to the smallest positive representable
   * value.
   *
   * @param f the normalization factor to encode
   * @return the single-byte encoding of <code>f</code>
   *
   * @see Field#setBoost(float)
   * @see SmallFloat
   */
  public static byte encodeNorm(float f) {
    return SmallFloat.floatToByte315(f);
  }

  /** Computes a score factor based on a term or phrase's frequency in a
   * document.  This value is multiplied by the {@link #idf(Term, Searcher)}
   * factor for each term in the query and these products are then summed to
   * form the initial score for a document.
   *
   * <p>Terms and phrases repeated in a document indicate the topic of the
   * document, so implementations of this method usually return larger values
   * when <code>freq</code> is large, and smaller values when <code>freq</code>
   * is small.
   *
   * <p>The default implementation calls {@link #tf(float)}.
   *
   * @param freq the frequency of a term within a document
   * @return a score factor based on a term's within-document frequency
   */
  public float tf(int freq) {
    return tf((float)freq);
  }

  /** Computes the amount of a sloppy phrase match, based on an edit distance.
   * This value is summed for each sloppy phrase match in a document to form
   * the frequency that is passed to {@link #tf(float)}.
   *
   * <p>A phrase match with a small edit distance to a document passage more
   * closely matches the document, so implementations of this method usually
   * return larger values when the edit distance is small and smaller values
   * when it is large.
   *
   * @see PhraseQuery#setSlop(int)
   * @param distance the edit distance of this sloppy phrase match
   * @return the frequency increment for this match
   */
  public abstract float sloppyFreq(int distance);

  /** Computes a score factor based on a term or phrase's frequency in a
   * document.  This value is multiplied by the {@link #idf(Term, Searcher)}
   * factor for each term in the query and these products are then summed to
   * form the initial score for a document.
   *
   * <p>Terms and phrases repeated in a document indicate the topic of the
   * document, so implementations of this method usually return larger values
   * when <code>freq</code> is large, and smaller values when <code>freq</code>
   * is small.
   *
   * @param freq the frequency of a term within a document
   * @return a score factor based on a term's within-document frequency
   */
  public abstract float tf(float freq);

  /** Computes a score factor for a simple term.
   *
   * <p>The default implementation is:<pre>
   *   return idf(searcher.docFreq(term), searcher.maxDoc());
   * </pre>
   *
   * Note that {@link Searcher#maxDoc()} is used instead of
   * {@link IndexReader#numDocs()} because it is proportional to
   * {@link Searcher#docFreq(Term)} , i.e., when one is inaccurate,
   * so is the other, and in the same direction.
   *
   * @param term the term in question
   * @param searcher the document collection being searched
   * @return a score factor for the term
   * @throws IOException declared because the searcher lookups may throw it
   */
  public float idf(Term term, Searcher searcher) throws IOException {
    return idf(searcher.docFreq(term), searcher.maxDoc());
  }

  /** Computes a score factor for a phrase.
   *
   * <p>The default implementation sums the {@link #idf(Term,Searcher)} factor
   * for each term in the phrase.
   *
   * @param terms the terms in the phrase
   * @param searcher the document collection being searched
   * @return a score factor for the phrase
   * @throws IOException if computing the factor of any single term throws it
   */
  public float idf(Collection terms, Searcher searcher) throws IOException {
    float idf = 0.0f;
    Iterator i = terms.iterator();
    while (i.hasNext()) {
      idf += idf((Term)i.next(), searcher);
    }
    return idf;
  }

  /** Computes a score factor based on a term's document frequency (the number
   * of documents which contain the term).  This value is multiplied by the
   * {@link #tf(int)} factor for each term in the query and these products are
   * then summed to form the initial score for a document.
   *
   * <p>Terms that occur in fewer documents are better indicators of topic, so
   * implementations of this method usually return larger values for rare terms,
   * and smaller values for common terms.
   *
   * @param docFreq the number of documents which contain the term
   * @param numDocs the total number of documents in the collection
   * @return a score factor based on the term's document frequency
   */
  public abstract float idf(int docFreq, int numDocs);

  /** Computes a score factor based on the fraction of all query terms that a
   * document contains.  This value is multiplied into scores.
   *
   * <p>The presence of a large portion of the query terms indicates a better
   * match with the query, so implementations of this method usually return
   * larger values when the ratio between these parameters is large and smaller
   * values when the ratio between them is small.
   *
   * @param overlap the number of query terms matched in the document
   * @param maxOverlap the total number of terms in the query
   * @return a score factor based on term overlap with the query
   */
  public abstract float coord(int overlap, int maxOverlap);
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -