segmentinfo.php

来自「PHP 知识管理系统(基于树结构的知识管理系统), 英文原版的PHP源码。」· PHP 代码 · 共 972 行 · 第 1/2 页

PHP
972
字号

        // search for appropriate value in dictionary
        $lowIndex = 0;
        $highIndex = count($this->_termDictionary)-1;
        while ($highIndex >= $lowIndex) {
            // $mid = ($highIndex - $lowIndex)/2;
            $mid = ($highIndex + $lowIndex) >> 1;
            $midTerm = $this->_termDictionary[$mid];

            $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */);
            $delta = $searchDicField - $fieldNum;
            if ($delta == 0) {
                $delta = strcmp($term->text, $midTerm[1] /* text */);
            }

            if ($delta < 0) {
                $highIndex = $mid-1;
            } elseif ($delta > 0) {
                $lowIndex  = $mid+1;
            } else {
                // return $this->_termDictionaryInfos[$mid]; // We got it!
                $a = $this->_termDictionaryInfos[$mid];
                $termInfo = new Zend_Search_Lucene_Index_TermInfo($a[0], $a[1], $a[2], $a[3], $a[4]);

                // Put loaded termInfo into cache
                $this->_termInfoCache[$termKey] = $termInfo;

                return $termInfo;
            }
        }

        if ($highIndex == -1) {
            // Term is out of the dictionary range
            return null;
        }

        $prevPosition = $highIndex;
        $prevTerm = $this->_termDictionary[$prevPosition];
        $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];

        $tisFile = $this->openCompoundFile('.tis');
        $tiVersion = $tisFile->readInt();
        if ($tiVersion != (int)0xFFFFFFFE) {
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
        }

        $termCount     = $tisFile->readLong();
        $indexInterval = $tisFile->readInt();
        $skipInterval  = $tisFile->readInt();

        $tisFile->seek($prevTermInfo[4] /* indexPointer */ - 20 /* header size*/, SEEK_CUR);

        $termValue    = $prevTerm[1] /* text */;
        $termFieldNum = $prevTerm[0] /* field */;
        $freqPointer = $prevTermInfo[1] /* freqPointer */;
        $proxPointer = $prevTermInfo[2] /* proxPointer */;
        for ($count = $prevPosition*$indexInterval + 1;
             $count <= $termCount &&
             ( $this->_getFieldPosition($termFieldNum) < $searchDicField ||
              ($this->_getFieldPosition($termFieldNum) == $searchDicField &&
               strcmp($termValue, $term->text) < 0) );
             $count++) {
            $termPrefixLength = $tisFile->readVInt();
            $termSuffix       = $tisFile->readString();
            $termFieldNum     = $tisFile->readVInt();
            $termValue        = Zend_Search_Lucene_Index_Term::getPrefix($termValue, $termPrefixLength) . $termSuffix;

            $docFreq      = $tisFile->readVInt();
            $freqPointer += $tisFile->readVInt();
            $proxPointer += $tisFile->readVInt();
            if( $docFreq >= $skipInterval ) {
                $skipOffset = $tisFile->readVInt();
            } else {
                $skipOffset = 0;
            }
        }

        if ($termFieldNum == $searchField && $termValue == $term->text) {
            $termInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
        } else {
            $termInfo = null;
        }

        // Put loaded termInfo into cache
        $this->_termInfoCache[$termKey] = $termInfo;

        if (count($this->_termInfoCache) == 1024) {
            $this->_cleanUpTermInfoCache();
        }

        return $termInfo;
    }

    /**
     * Load normalizatin factors from an index file
     *
     * @param integer $fieldNum
     */
    private function _loadNorm($fieldNum)
    {
        $fFile = $this->openCompoundFile('.f' . $fieldNum);
        $this->_norms[$fieldNum] = $fFile->readBytes($this->_docCount);
    }

    /**
     * Returns normalization factor for specified documents
     *
     * @param integer $id
     * @param string $fieldName
     * @return float
     */
    public function norm($id, $fieldName)
    {
        $fieldNum = $this->getFieldNum($fieldName);

        if ( !($this->_fields[$fieldNum]->isIndexed) ) {
            return null;
        }

        if (!isset($this->_norms[$fieldNum])) {
            $this->_loadNorm($fieldNum);
        }

        return Zend_Search_Lucene_Search_Similarity::decodeNorm( ord($this->_norms[$fieldNum]{$id}) );
    }

    /**
     * Returns norm vector, encoded in a byte string
     *
     * @param string $fieldName
     * @return string
     */
    public function normVector($fieldName)
    {
        $fieldNum = $this->getFieldNum($fieldName);

        if ($fieldNum == -1  ||  !($this->_fields[$fieldNum]->isIndexed)) {
            $similarity = Zend_Search_Lucene_Search_Similarity::getDefault();

            return str_repeat(chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) )),
                              $this->_docCount);
        }

        if (!isset($this->_norms[$fieldNum])) {
            $this->_loadNorm($fieldNum);
        }

        return $this->_norms[$fieldNum];
    }


    /**
     * Returns true if any documents have been deleted from this index segment.
     *
     * @return boolean
     */
    public function hasDeletions()
    {
        return $this->_deleted !== null;
    }


    /**
     * Deletes a document from the index segment.
     * $id is an internal document id
     *
     * @param integer
     */
    public function delete($id)
    {
        $this->_deletedDirty = true;

        if (extension_loaded('bitset')) {
            if ($this->_deleted === null) {
                $this->_deleted = bitset_empty($id);
            }
            bitset_incl($this->_deleted, $id);
        } else {
            if ($this->_deleted === null) {
                $this->_deleted = array();
            }

            $this->_deleted[$id] = 1;
        }
    }

    /**
     * Checks, that document is deleted
     *
     * @param integer
     * @return boolean
     */
    public function isDeleted($id)
    {
        if ($this->_deleted === null) {
            return false;
        }

        if (extension_loaded('bitset')) {
            return bitset_in($this->_deleted, $id);
        } else {
            return isset($this->_deleted[$id]);
        }
    }


    /**
     * Write changes if it's necessary.
     */
    public function writeChanges()
    {
        if (!$this->_deletedDirty) {
            return;
        }

        if (extension_loaded('bitset')) {
            $delBytes = $this->_deleted;
            $bitCount = count(bitset_to_array($delBytes));
        } else {
            $byteCount = floor($this->_docCount/8)+1;
            $delBytes = str_repeat(chr(0), $byteCount);
            for ($count = 0; $count < $byteCount; $count++) {
                $byte = 0;
                for ($bit = 0; $bit < 8; $bit++) {
                    if (isset($this->_deleted[$count*8 + $bit])) {
                        $byte |= (1<<$bit);
                    }
                }
                $delBytes{$count} = chr($byte);
            }
            $bitCount = count($this->_deleted);
        }


        $delFile = $this->_directory->createFile($this->_name . '.del');
        $delFile->writeInt($this->_docCount);
        $delFile->writeInt($bitCount);
        $delFile->writeBytes($delBytes);

        $this->_deletedDirty = false;
    }



    /**
     * Term Dictionary File object for stream like terms reading
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_tisFile = null;

    /**
     * Frequencies File object for stream like terms reading
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_frqFile = null;

    /**
     * Offset of the .frq file in the compound file
     *
     * @var integer
     */
    private $_frqFileOffset;

    /**
     * Positions File object for stream like terms reading
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_prxFile = null;

    /**
     * Offset of the .prx file in the compound file
     *
     * @var integer
     */
    private $_prxFileOffset;


    /**
     * Number of terms in term stream
     *
     * @var integer
     */
    private $_termCount = 0;

    /**
     * Segment skip interval
     *
     * @var integer
     */
    private $_skipInterval;

    /**
     * Last TermInfo in a terms stream
     *
     * @var Zend_Search_Lucene_Index_TermInfo
     */
    private $_lastTermInfo = null;

    /**
     * Last Term in a terms stream
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_lastTerm = null;

    /**
     * Map of the document IDs
     * Used to get new docID after removing deleted documents.
     * It's not very effective from memory usage point of view,
     * but much more faster, then other methods
     *
     * @var array|null
     */
    private $_docMap = null;

    /**
     * An array of all term positions in the documents.
     * Array structure: array( docId => array( pos1, pos2, ...), ...)
     *
     * @var array
     */
    private $_lastTermPositions;

    /**
     * Reset terms stream
     *
     * $startId - id for the fist document
     * $compact - remove deleted documents
     *
     * Returns start document id for the next segment
     *
     * @param integer $startId
     * @param boolean $compact
     * @throws Zend_Search_Lucene_Exception
     * @return integer
     */
    public function reset($startId = 0, $compact = false)
    {
        if ($this->_tisFile !== null) {
            $this->_tisFile = null;
        }

        $this->_tisFile = $this->openCompoundFile('.tis', false);
        $tiVersion = $this->_tisFile->readInt();
        if ($tiVersion != (int)0xFFFFFFFE) {
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
        }

        $this->_termCount    = $this->_tisFile->readLong();
                               $this->_tisFile->readInt();  // Read Index interval
        $this->_skipInterval = $this->_tisFile->readInt();  // Read skip interval

        if ($this->_frqFile !== null) {
            $this->_frqFile = null;
        }
        $this->_frqFile = $this->openCompoundFile('.frq', false);
        $this->_frqFileOffset = $this->_frqFile->tell();

        if ($this->_prxFile !== null) {
            $this->_prxFile = null;
        }
        $this->_prxFile = $this->openCompoundFile('.prx', false);
        $this->_prxFileOffset = $this->_prxFile->tell();

        $this->_lastTerm     = new Zend_Search_Lucene_Index_Term('', -1);
        $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo(0, 0, 0, 0);

        $this->_docMap = array();
        for ($count = 0; $count < $this->_docCount; $count++) {
            if (!$this->isDeleted($count)) {
                $this->_docMap[$count] = $startId + ($compact ? count($this->_docMap) : $count);
            }
        }

        $this->nextTerm();
        return $startId + ($compact ? count($this->_docMap) : $this->_docCount);
    }


    /**
     * Scans terms dictionary and returns next term
     *
     * @return Zend_Search_Lucene_Index_Term|null
     */
    public function nextTerm()
    {
        if ($this->_tisFile === null  ||  $this->_termCount == 0) {
            $this->_lastTerm     = null;
            $this->_lastTermInfo = null;

            // may be necessary for "empty" segment
            $this->_tisFile = null;
            $this->_frqFile = null;
            $this->_prxFile = null;

            return null;
        }

        $termPrefixLength = $this->_tisFile->readVInt();
        $termSuffix       = $this->_tisFile->readString();
        $termFieldNum     = $this->_tisFile->readVInt();
        $termValue        = Zend_Search_Lucene_Index_Term::getPrefix($this->_lastTerm->text, $termPrefixLength) . $termSuffix;

        $this->_lastTerm = new Zend_Search_Lucene_Index_Term($termValue, $this->_fields[$termFieldNum]->name);

        $docFreq     = $this->_tisFile->readVInt();
        $freqPointer = $this->_lastTermInfo->freqPointer + $this->_tisFile->readVInt();
        $proxPointer = $this->_lastTermInfo->proxPointer + $this->_tisFile->readVInt();
        if ($docFreq >= $this->_skipInterval) {
            $skipOffset = $this->_tisFile->readVInt();
        } else {
            $skipOffset = 0;
        }

        $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);


        $this->_lastTermPositions = array();

        $this->_frqFile->seek($this->_lastTermInfo->freqPointer + $this->_frqFileOffset, SEEK_SET);
        $freqs = array();   $docId = 0;
        for( $count = 0; $count < $this->_lastTermInfo->docFreq; $count++ ) {
            $docDelta = $this->_frqFile->readVInt();
            if( $docDelta % 2 == 1 ) {
                $docId += ($docDelta-1)/2;
                $freqs[ $docId ] = 1;
            } else {
                $docId += $docDelta/2;
                $freqs[ $docId ] = $this->_frqFile->readVInt();
            }
        }

        $this->_prxFile->seek($this->_lastTermInfo->proxPointer + $this->_prxFileOffset, SEEK_SET);
        foreach ($freqs as $docId => $freq) {
            $termPosition = 0;  $positions = array();

            for ($count = 0; $count < $freq; $count++ ) {
                $termPosition += $this->_prxFile->readVInt();
                $positions[] = $termPosition;
            }

            if (isset($this->_docMap[$docId])) {
                $this->_lastTermPositions[$this->_docMap[$docId]] = $positions;
            }
        }


        $this->_termCount--;
        if ($this->_termCount == 0) {
            $this->_tisFile = null;
            $this->_frqFile = null;
            $this->_prxFile = null;
        }

        return $this->_lastTerm;
    }


    /**
     * Returns term in current position
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @return Zend_Search_Lucene_Index_Term|null
     */
    public function currentTerm()
    {
        return $this->_lastTerm;
    }


    /**
     * Returns an array of all term positions in the documents.
     * Return array structure: array( docId => array( pos1, pos2, ...), ...)
     *
     * @return array
     */
    public function currentTermPositions()
    {
        return $this->_lastTermPositions;
    }
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?