📄 similarity.cs

📁 Lucene.Net 版本源码测试通过
💻 CS
📖 第 1 页 / 共 2 页
字号:
12 下一页
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;

using IndexReader = Lucene.Net.Index.IndexReader;
using IndexWriter = Lucene.Net.Index.IndexWriter;
using Term = Lucene.Net.Index.Term;
using SmallFloat = Lucene.Net.Util.SmallFloat;

namespace Lucene.Net.Search
{
	
	/// <summary>Expert: Scoring API.
	/// <p>Subclasses implement search scoring.
	/// 
	/// <p>The score of query <code>q</code> for document <code>d</code> correlates to the
	/// cosine-distance or dot-product between document and query vectors in a
	/// <a href="http://en.wikipedia.org/wiki/Vector_Space_Model">
	/// Vector Space Model (VSM) of Information Retrieval</a>.
	/// A document whose vector is closer to the query vector in that model is scored higher.
	/// 
	/// The score is computed as follows:
	/// 
	/// <P>
	/// <table cellpadding="1" cellspacing="0" border="1" align="center">
	/// <tr><td>
	/// <table cellpadding="1" cellspacing="0" border="0" align="center">
	/// <tr>
	/// <td valign="middle" align="right" rowspan="1">
	/// score(q,d) &nbsp; = &nbsp;
	/// <A HREF="#formula_coord">coord(q,d)</A> &nbsp;&middot;&nbsp;
	/// <A HREF="#formula_queryNorm">queryNorm(q)</A> &nbsp;&middot;&nbsp;
	/// </td>
	/// <td valign="bottom" align="center" rowspan="1">
	/// <big><big><big>&sum;</big></big></big>
	/// </td>
	/// <td valign="middle" align="right" rowspan="1">
	/// <big><big>(</big></big>
	/// <A HREF="#formula_tf">tf(t in d)</A> &nbsp;&middot;&nbsp;
	/// <A HREF="#formula_idf">idf(t)</A><sup>2</sup> &nbsp;&middot;&nbsp;
	/// <A HREF="#formula_termBoost">t.getBoost()</A>&nbsp;&middot;&nbsp;
	/// <A HREF="#formula_norm">norm(t,d)</A>
	/// <big><big>)</big></big>
	/// </td>
	/// </tr>
	/// <tr valigh="top">
	/// <td></td>
	/// <td align="center"><small>t in q</small></td>
	/// <td></td>
	/// </tr>
	/// </table>
	/// </td></tr>
	/// </table>
	/// 
	/// <p> where
	/// <ol>
	/// <li>
	/// <A NAME="formula_tf"></A>
	/// <b>tf(t in d)</b>
	/// correlates to the term's <i>frequency</i>,
	/// defined as the number of times term <i>t</i> appears in the currently scored document <i>d</i>.
	/// Documents that have more occurrences of a given term receive a higher score.
	/// The default computation for <i>tf(t in d)</i> in
	/// {@link Lucene.Net.Search.DefaultSimilarity#Tf(float) DefaultSimilarity} is:
	/// 
	/// <br>&nbsp;<br>
	/// <table cellpadding="2" cellspacing="2" border="0" align="center">
	/// <tr>
	/// <td valign="middle" align="right" rowspan="1">
	/// {@link Lucene.Net.Search.DefaultSimilarity#Tf(float) tf(t in d)} &nbsp; = &nbsp;
	/// </td>
	/// <td valign="top" align="center" rowspan="1">
	/// frequency<sup><big>&frac12;</big></sup>
	/// </td>
	/// </tr>
	/// </table>
	/// <br>&nbsp;<br>
	/// </li>
	/// 
	/// <li>
	/// <A NAME="formula_idf"></A>
	/// <b>idf(t)</b> stands for Inverse Document Frequency. This value
	/// correlates to the inverse of <i>docFreq</i>
	/// (the number of documents in which the term <i>t</i> appears).
	/// This means rarer terms give higher contribution to the total score.
	/// The default computation for <i>idf(t)</i> in
	/// {@link Lucene.Net.Search.DefaultSimilarity#Idf(int, int) DefaultSimilarity} is:
	/// 
	/// <br>&nbsp;<br>
	/// <table cellpadding="2" cellspacing="2" border="0" align="center">
	/// <tr>
	/// <td valign="middle" align="right">
	/// {@link Lucene.Net.Search.DefaultSimilarity#Idf(int, int) idf(t)}&nbsp; = &nbsp;
	/// </td>
	/// <td valign="middle" align="center">
	/// 1 + log <big>(</big>
	/// </td>
	/// <td valign="middle" align="center">
	/// <table>
	/// <tr><td align="center"><small>numDocs</small></td></tr>
	/// <tr><td align="center">&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;</td></tr>
	/// <tr><td align="center"><small>docFreq+1</small></td></tr>
	/// </table>
	/// </td>
	/// <td valign="middle" align="center">
	/// <big>)</big>
	/// </td>
	/// </tr>
	/// </table>
	/// <br>&nbsp;<br>
	/// </li>
	/// 
	/// <li>
	/// <A NAME="formula_coord"></A>
	/// <b>coord(q,d)</b>
	/// is a score factor based on how many of the query terms are found in the specified document.
	/// Typically, a document that contains more of the query's terms will receive a higher score
	/// than another document with fewer query terms.
	/// This is a search time factor computed in
	/// {@link #Coord(int, int) coord(q,d)}
	/// by the Similarity in effect at search time.
	/// <br>&nbsp;<br>
	/// </li>
	/// 
	/// <li><b>
	/// <A NAME="formula_queryNorm"></A>
	/// queryNorm(q)
	/// </b>
	/// is a normalizing factor used to make scores between queries comparable.
	/// This factor does not affect document ranking (since all ranked documents are multiplied by the same factor),
	/// but rather just attempts to make scores from different queries (or even different indexes) comparable.
	/// This is a search time factor computed by the Similarity in effect at search time.
	/// 
	/// The default computation in
	/// {@link Lucene.Net.Search.DefaultSimilarity#QueryNorm(float) DefaultSimilarity}
	/// is:
	/// <br>&nbsp;<br>
	/// <table cellpadding="1" cellspacing="0" border="0" align="center">
	/// <tr>
	/// <td valign="middle" align="right" rowspan="1">
	/// queryNorm(q)  &nbsp; = &nbsp;
	/// {@link Lucene.Net.Search.DefaultSimilarity#QueryNorm(float) queryNorm(sumOfSquaredWeights)}
	/// &nbsp; = &nbsp;
	/// </td>
	/// <td valign="middle" align="center" rowspan="1">
	/// <table>
	/// <tr><td align="center"><big>1</big></td></tr>
	/// <tr><td align="center"><big>
	/// &ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;
	/// </big></td></tr>
	/// <tr><td align="center">sumOfSquaredWeights<sup><big>&frac12;</big></sup></td></tr>
	/// </table>
	/// </td>
	/// </tr>
	/// </table>
	/// <br>&nbsp;<br>
	/// 
	/// The sum of squared weights (of the query terms) is
	/// computed by the query {@link Lucene.Net.Search.Weight} object.
	/// For example, a {@link Lucene.Net.Search.BooleanQuery boolean query}
	/// computes this value as:
	/// 
	/// <br>&nbsp;<br>
	/// <table cellpadding="1" cellspacing="0" border="0"n align="center">
	/// <tr>
	/// <td valign="middle" align="right" rowspan="1">
	/// {@link Lucene.Net.Search.Weight#SumOfSquaredWeights() sumOfSquaredWeights} &nbsp; = &nbsp;
	/// {@link Lucene.Net.Search.Query#GetBoost() q.getBoost()} <sup><big>2</big></sup>
	/// &nbsp;&middot;&nbsp;
	/// </td>
	/// <td valign="bottom" align="center" rowspan="1">
	/// <big><big><big>&sum;</big></big></big>
	/// </td>
	/// <td valign="middle" align="right" rowspan="1">
	/// <big><big>(</big></big>
	/// <A HREF="#formula_idf">idf(t)</A> &nbsp;&middot;&nbsp;
	/// <A HREF="#formula_termBoost">t.getBoost()</A>
	/// <big><big>) <sup>2</sup> </big></big>
	/// </td>
	/// </tr>
	/// <tr valigh="top">
	/// <td></td>
	/// <td align="center"><small>t in q</small></td>
	/// <td></td>
	/// </tr>
	/// </table>
	/// <br>&nbsp;<br>
	/// 
	/// </li>
	/// 
	/// <li>
	/// <A NAME="formula_termBoost"></A>
	/// <b>t.getBoost()</b>
	/// is a search time boost of term <i>t</i> in the query <i>q</i> as
	/// specified in the query text
	/// (see <A HREF="../../../../../queryparsersyntax.html#Boosting a Term">query syntax</A>),
	/// or as set by application calls to
	/// {@link Lucene.Net.Search.Query#SetBoost(float) setBoost()}.
	/// Notice that there is really no direct API for accessing a boost of one term in a multi term query,
	/// but rather multi terms are represented in a query as multi
	/// {@link Lucene.Net.Search.TermQuery TermQuery} objects,
	/// and so the boost of a term in the query is accessible by calling the sub-query
	/// {@link Lucene.Net.Search.Query#GetBoost() getBoost()}.
	/// <br>&nbsp;<br>
	/// </li>
	/// 
	/// <li>
	/// <A NAME="formula_norm"></A>
	/// <b>norm(t,d)</b> encapsulates a few (indexing time) boost and length factors:
	/// 
	/// <ul>
	/// <li><b>Document boost</b> - set by calling
	/// {@link Lucene.Net.Documents.Document#SetBoost(float) doc.setBoost()}
	/// before adding the document to the index.
	/// </li>
	/// <li><b>Field boost</b> - set by calling
	/// {@link Lucene.Net.Documents.Fieldable#SetBoost(float) field.setBoost()}
	/// before adding the field to a document.
	/// </li>
	/// <li>{@link #LengthNorm(String, int) <b>lengthNorm</b>(field)} - computed
	/// when the document is added to the index in accordance with the number of tokens
	/// of this field in the document, so that shorter fields contribute more to the score.
	/// LengthNorm is computed by the Similarity class in effect at indexing.
	/// </li>
	/// </ul>
	/// 
	/// <p>
	/// When a document is added to the index, all the above factors are multiplied.
	/// If the document has multiple fields with the same name, all their boosts are multiplied together:
	/// 
	/// <br>&nbsp;<br>
	/// <table cellpadding="1" cellspacing="0" border="0"n align="center">
	/// <tr>
	/// <td valign="middle" align="right" rowspan="1">
	/// norm(t,d) &nbsp; = &nbsp;
	/// {@link Lucene.Net.Documents.Document#GetBoost() doc.getBoost()}
	/// &nbsp;&middot;&nbsp;
	/// {@link #LengthNorm(String, int) lengthNorm(field)}
	/// &nbsp;&middot;&nbsp;
	/// </td>
	/// <td valign="bottom" align="center" rowspan="1">
	/// <big><big><big>&prod;</big></big></big>
	/// </td>
	/// <td valign="middle" align="right" rowspan="1">
	/// {@link Lucene.Net.Documents.Fieldable#GetBoost() f.getBoost}()
	/// </td>
	/// </tr>
	/// <tr valigh="top">
	/// <td></td>
	/// <td align="center"><small>field <i><b>f</b></i> in <i>d</i> named as <i><b>t</b></i></small></td>
	/// <td></td>
	/// </tr>
	/// </table>
	/// <br>&nbsp;<br>
	/// However the resulted <i>norm</i> value is {@link #EncodeNorm(float) encoded} as a single byte
	/// before being stored.
	/// At search time, the norm byte value is read from the index
	/// {@link Lucene.Net.Store.Directory directory} and
	/// {@link #DecodeNorm(byte) decoded} back to a float <i>norm</i> value.
	/// This encoding/decoding, while reducing index size, comes with the price of
	/// precision loss - it is not guaranteed that decode(encode(x)) = x.
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -