similarity.php

来自「PHP 知识管理系统(基于树结构的知识管理系统), 英文原版的PHP源码。」· PHP 代码 · 共 554 行 · 第 1/2 页

PHP
554
字号
                                        229 => 8.388608E7,
                                        230 => 1.00663296E8,
                                        231 => 1.17440512E8,
                                        232 => 1.34217728E8,
                                        233 => 1.6777216E8,
                                        234 => 2.01326592E8,
                                        235 => 2.34881024E8,
                                        236 => 2.68435456E8,
                                        237 => 3.3554432E8,
                                        238 => 4.02653184E8,
                                        239 => 4.69762048E8,
                                        240 => 5.3687091E8,
                                        241 => 6.7108864E8,
                                        242 => 8.0530637E8,
                                        243 => 9.395241E8,
                                        244 => 1.07374182E9,
                                        245 => 1.34217728E9,
                                        246 => 1.61061274E9,
                                        247 => 1.87904819E9,
                                        248 => 2.14748365E9,
                                        249 => 2.68435456E9,
                                        250 => 3.22122547E9,
                                        251 => 3.75809638E9,
                                        252 => 4.2949673E9,
                                        253 => 5.3687091E9,
                                        254 => 6.4424509E9,
                                        255 => 7.5161928E9 );


    /**
     * Set the default Similarity implementation used by indexing and search
     * code.
     *
     * @param Zend_Search_Lucene_Search_Similarity $similarity
     */
    static public function setDefault(Zend_Search_Lucene_Search_Similarity $similarity)
    {
        self::$_defaultImpl = $similarity;
    }


    /**
     * Return the default Similarity implementation used by indexing and search
     * code.
     *
     * @return Zend_Search_Lucene_Search_Similarity
     */
    static public function getDefault()
    {
        if (!self::$_defaultImpl instanceof Zend_Search_Lucene_Search_Similarity) {
            self::$_defaultImpl = new Zend_Search_Lucene_Search_Similarity_Default();
        }

        return self::$_defaultImpl;
    }


    /**
     * Computes the normalization value for a field given the total number of
     * terms contained in a field.  These values, together with field boosts, are
     * stored in an index and multipled into scores for hits on each field by the
     * search code.
     *
     * Matches in longer fields are less precise, so implemenations of this
     * method usually return smaller values when 'numTokens' is large,
     * and larger values when 'numTokens' is small.
     *
     * That these values are computed under
     * IndexWriter::addDocument(Document) and stored then using
     * encodeNorm(float).  Thus they have limited precision, and documents
     * must be re-indexed if this method is altered.
     *
     * fieldName - name of field
     * numTokens - the total number of tokens contained in fields named
     *             'fieldName' of 'doc'.
     * Returns a normalization factor for hits on this field of this document
     *
     * @param string $fieldName
     * @param integer $numTokens
     * @return float
     */
    abstract public function lengthNorm($fieldName, $numTokens);

    /**
     * Computes the normalization value for a query given the sum of the squared
     * weights of each of the query terms.  This value is then multipled into the
     * weight of each query term.
     *
     * This does not affect ranking, but rather just attempts to make scores
     * from different queries comparable.
     *
     * sumOfSquaredWeights - the sum of the squares of query term weights
     * Returns a normalization factor for query weights
     *
     * @param float $sumOfSquaredWeights
     * @return float
     */
    abstract public function queryNorm($sumOfSquaredWeights);


    /**
     *  Decodes a normalization factor stored in an index.
     *
     * @param integer $byte
     * @return float
     */
    static public function decodeNorm($byte)
    {
        return self::$_normTable[$byte & 0xFF];
    }


    /**
     * Encodes a normalization factor for storage in an index.
     *
     * The encoding uses a five-bit exponent and three-bit mantissa, thus
     * representing values from around 7x10^9 to 2x10^-9 with about one
     * significant decimal digit of accuracy.  Zero is also represented.
     * Negative numbers are rounded up to zero.  Values too large to represent
     * are rounded down to the largest representable value.  Positive values too
     * small to represent are rounded up to the smallest positive representable
     * value.
     *
     * @param float $f
     * @return integer
     */
    static function encodeNorm($f)
    {
      return self::_floatToByte($f);
    }

    /**
     * Float to byte conversion
     *
     * @param integer $b
     * @return float
     */
    static private function _floatToByte($f)
    {
        // round negatives up to zero
        if ($f <= 0.0) {
            return 0;
        }

        // search for appropriate value
        $lowIndex = 0;
        $highIndex = 255;
        while ($highIndex >= $lowIndex) {
            // $mid = ($highIndex - $lowIndex)/2;
            $mid = ($highIndex + $lowIndex) >> 1;
            $delta = $f - self::$_normTable[$mid];

            if ($delta < 0) {
                $highIndex = $mid-1;
            } elseif ($delta > 0) {
                $lowIndex  = $mid+1;
            } else {
                return $mid; // We got it!
            }
        }

        // round to closest value
        if ($highIndex != 255 &&
            $f - self::$_normTable[$highIndex] > self::$_normTable[$highIndex+1] - $f ) {
            return $highIndex + 1;
        } else {
            return $highIndex;
        }
    }


    /**
     * Computes a score factor based on a term or phrase's frequency in a
     * document.  This value is multiplied by the idf(Term, Searcher)
     * factor for each term in the query and these products are then summed to
     * form the initial score for a document.
     *
     * Terms and phrases repeated in a document indicate the topic of the
     * document, so implementations of this method usually return larger values
     * when 'freq' is large, and smaller values when 'freq'
     * is small.
     *
     * freq - the frequency of a term within a document
     * Returns a score factor based on a term's within-document frequency
     *
     * @param float $freq
     * @return float
     */
    abstract public function tf($freq);

    /**
     * Computes the amount of a sloppy phrase match, based on an edit distance.
     * This value is summed for each sloppy phrase match in a document to form
     * the frequency that is passed to tf(float).
     *
     * A phrase match with a small edit distance to a document passage more
     * closely matches the document, so implementations of this method usually
     * return larger values when the edit distance is small and smaller values
     * when it is large.
     *
     * distance - the edit distance of this sloppy phrase match
     * Returns the frequency increment for this match
     *
     * @param integer $distance
     * @return float
     */
    abstract public function sloppyFreq($distance);


    /**
     * Computes a score factor for a simple term or a phrase.
     *
     * The default implementation is:
     *   return idfFreq(searcher.docFreq(term), searcher.maxDoc());
     *
     * input - the term in question or array of terms
     * reader - reader the document collection being searched
     * Returns a score factor for the term
     *
     * @param mixed $input
     * @param Zend_Search_Lucene $reader
     * @return a score factor for the term
     */
    public function idf($input, $reader)
    {
        if (!is_array($input)) {
            return $this->idfFreq($reader->docFreq($input), $reader->count());
        } else {
            $idf = 0.0;
            foreach ($input as $term) {
                $idf += $this->idfFreq($reader->docFreq($term), $reader->count());
            }
            return $idf;
        }
    }

    /**
     * Computes a score factor based on a term's document frequency (the number
     * of documents which contain the term).  This value is multiplied by the
     * tf(int) factor for each term in the query and these products are
     * then summed to form the initial score for a document.
     *
     * Terms that occur in fewer documents are better indicators of topic, so
     * implemenations of this method usually return larger values for rare terms,
     * and smaller values for common terms.
     *
     * docFreq - the number of documents which contain the term
     * numDocs - the total number of documents in the collection
     * Returns a score factor based on the term's document frequency
     *
     * @param integer $docFreq
     * @param integer $numDocs
     * @return float
     */
    abstract public function idfFreq($docFreq, $numDocs);

    /**
     * Computes a score factor based on the fraction of all query terms that a
     * document contains.  This value is multiplied into scores.
     *
     * The presence of a large portion of the query terms indicates a better
     * match with the query, so implemenations of this method usually return
     * larger values when the ratio between these parameters is large and smaller
     * values when the ratio between them is small.
     *
     * overlap - the number of query terms matched in the document
     * maxOverlap - the total number of terms in the query
     * Returns a score factor based on term overlap with the query
     *
     * @param integer $overlap
     * @param integer $maxOverlap
     * @return float
     */
    abstract public function coord($overlap, $maxOverlap);
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?