segmentwriter.php

来自「PHP 知识管理系统(基于树结构的知识管理系统), 英文原版的PHP源码。」· PHP 代码 · 共 601 行 · 第 1/2 页

PHP
601
字号
    private $_frqFile = null;

    /**
     * Positions file
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_prxFile = null;

    /**
     * Number of written terms
     *
     * @var integer
     */
    private $_termCount;


    /**
     * Last saved term
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_prevTerm;

    /**
     * Last saved term info
     *
     * @var Zend_Search_Lucene_Index_TermInfo
     */
    private $_prevTermInfo;

    /**
     * Last saved index term
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_prevIndexTerm;

    /**
     * Last saved index term info
     *
     * @var Zend_Search_Lucene_Index_TermInfo
     */
    private $_prevIndexTermInfo;

    /**
     * Last term dictionary file position
     *
     * @var integer
     */
    private $_lastIndexPosition;

    /**
     * Create dicrionary, frequency and positions files and write necessary headers
     */
    public function initializeDictionaryFiles()
    {
        $this->_tisFile = $this->_directory->createFile($this->_name . '.tis');
        $this->_tisFile->writeInt((int)0xFFFFFFFE);
        $this->_tisFile->writeLong(0 /* dummy data for terms count */);
        $this->_tisFile->writeInt(self::$indexInterval);
        $this->_tisFile->writeInt(self::$skipInterval);

        $this->_tiiFile = $this->_directory->createFile($this->_name . '.tii');
        $this->_tiiFile->writeInt((int)0xFFFFFFFE);
        $this->_tiiFile->writeLong(0 /* dummy data for terms count */);
        $this->_tiiFile->writeInt(self::$indexInterval);
        $this->_tiiFile->writeInt(self::$skipInterval);

        /** Dump dictionary header */
        $this->_tiiFile->writeVInt(0);                    // preffix length
        $this->_tiiFile->writeString('');                 // suffix
        $this->_tiiFile->writeInt((int)0xFFFFFFFF);       // field number
        $this->_tiiFile->writeByte((int)0x0F);
        $this->_tiiFile->writeVInt(0);                    // DocFreq
        $this->_tiiFile->writeVInt(0);                    // FreqDelta
        $this->_tiiFile->writeVInt(0);                    // ProxDelta
        $this->_tiiFile->writeVInt(20);                   // IndexDelta

        $this->_frqFile = $this->_directory->createFile($this->_name . '.frq');
        $this->_prxFile = $this->_directory->createFile($this->_name . '.prx');

        $this->_files[] = $this->_name . '.tis';
        $this->_files[] = $this->_name . '.tii';
        $this->_files[] = $this->_name . '.frq';
        $this->_files[] = $this->_name . '.prx';

        $this->_prevTerm          = null;
        $this->_prevTermInfo      = null;
        $this->_prevIndexTerm     = null;
        $this->_prevIndexTermInfo = null;
        $this->_lastIndexPosition = 20;
        $this->_termCount         = 0;

    }

    /**
     * Add term
     *
     * Term positions is an array( docId => array(pos1, pos2, pos3, ...), ... )
     *
     * @param Zend_Search_Lucene_Index_Term $termEntry
     * @param array $termDocs
     */
    public function addTerm($termEntry, $termDocs)
    {
        $freqPointer = $this->_frqFile->tell();
        $proxPointer = $this->_prxFile->tell();

        $prevDoc = 0;
        foreach ($termDocs as $docId => $termPositions) {
            $docDelta = ($docId - $prevDoc)*2;
            $prevDoc = $docId;
            if (count($termPositions) > 1) {
                $this->_frqFile->writeVInt($docDelta);
                $this->_frqFile->writeVInt(count($termPositions));
            } else {
                $this->_frqFile->writeVInt($docDelta + 1);
            }

            $prevPosition = 0;
            foreach ($termPositions as $position) {
                $this->_prxFile->writeVInt($position - $prevPosition);
                $prevPosition = $position;
            }
        }

        if (count($termDocs) >= self::$skipInterval) {
            /**
             * @todo Write Skip Data to a freq file.
             * It's not used now, but make index more optimal
             */
            $skipOffset = $this->_frqFile->tell() - $freqPointer;
        } else {
            $skipOffset = 0;
        }

        $term = new Zend_Search_Lucene_Index_Term($termEntry->text,
                                                  $this->_fields[$termEntry->field]->number);
        $termInfo = new Zend_Search_Lucene_Index_TermInfo(count($termDocs),
                                                          $freqPointer, $proxPointer, $skipOffset);

        $this->_dumpTermDictEntry($this->_tisFile, $this->_prevTerm, $term, $this->_prevTermInfo, $termInfo);

        if (($this->_termCount + 1) % self::$indexInterval == 0) {
            $this->_dumpTermDictEntry($this->_tiiFile, $this->_prevIndexTerm, $term, $this->_prevIndexTermInfo, $termInfo);

            $indexPosition = $this->_tisFile->tell();
            $this->_tiiFile->writeVInt($indexPosition - $this->_lastIndexPosition);
            $this->_lastIndexPosition = $indexPosition;

        }
        $this->_termCount++;
    }

    /**
     * Close dictionary
     */
    public function closeDictionaryFiles()
    {
        $this->_tisFile->seek(4);
        $this->_tisFile->writeLong($this->_termCount);

        $this->_tiiFile->seek(4);
        $this->_tiiFile->writeLong(ceil(($this->_termCount + 2)/self::$indexInterval));
    }


    /**
     * Dump Term Dictionary segment file entry.
     * Used to write entry to .tis or .tii files
     *
     * @param Zend_Search_Lucene_Storage_File $dicFile
     * @param Zend_Search_Lucene_Index_Term $prevTerm
     * @param Zend_Search_Lucene_Index_Term $term
     * @param Zend_Search_Lucene_Index_TermInfo $prevTermInfo
     * @param Zend_Search_Lucene_Index_TermInfo $termInfo
     */
    protected function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile,
                                        &$prevTerm,     Zend_Search_Lucene_Index_Term     $term,
                                        &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo)
    {
        if (isset($prevTerm) && $prevTerm->field == $term->field) {
            $matchedBytes = 0;
            $maxBytes = min(strlen($prevTerm->text), strlen($term->text));
            while ($matchedBytes < $maxBytes  &&
                   $prevTerm->text[$matchedBytes] == $term->text[$matchedBytes]) {
                $matchedBytes++;
            }

            // Calculate actual matched UTF-8 pattern
            $prefixBytes = 0;
            $prefixChars = 0;
            while ($prefixBytes < $matchedBytes) {
                $charBytes = 1;
                if ((ord($term->text[$prefixBytes]) & 0xC0) == 0xC0) {
                    $charBytes++;
                    if (ord($term->text[$prefixBytes]) & 0x20 ) {
                        $charBytes++;
                        if (ord($term->text[$prefixBytes]) & 0x10 ) {
                            $charBytes++;
                        }
                    }
                }

                if ($prefixBytes + $charBytes > $matchedBytes) {
                    // char crosses matched bytes boundary
                    // skip char
                    break;
                }

                $prefixChars++;
                $prefixBytes += $charBytes;
            }

            // Write preffix length
            $dicFile->writeVInt($prefixChars);
            // Write suffix
            $dicFile->writeString(substr($term->text, $prefixBytes));
        } else {
            // Write preffix length
            $dicFile->writeVInt(0);
            // Write suffix
            $dicFile->writeString($term->text);
        }
        // Write field number
        $dicFile->writeVInt($term->field);
        // DocFreq (the count of documents which contain the term)
        $dicFile->writeVInt($termInfo->docFreq);

        $prevTerm = $term;

        if (!isset($prevTermInfo)) {
            // Write FreqDelta
            $dicFile->writeVInt($termInfo->freqPointer);
            // Write ProxDelta
            $dicFile->writeVInt($termInfo->proxPointer);
        } else {
            // Write FreqDelta
            $dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer);
            // Write ProxDelta
            $dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer);
        }
        // Write SkipOffset - it's not 0 when $termInfo->docFreq > self::$skipInterval
        if ($termInfo->skipOffset != 0) {
            $dicFile->writeVInt($termInfo->skipOffset);
        }

        $prevTermInfo = $termInfo;
    }


    /**
     * Generate compound index file
     */
    protected function _generateCFS()
    {
        $cfsFile = $this->_directory->createFile($this->_name . '.cfs');
        $cfsFile->writeVInt(count($this->_files));

        $dataOffsetPointers = array();
        foreach ($this->_files as $fileName) {
            $dataOffsetPointers[$fileName] = $cfsFile->tell();
            $cfsFile->writeLong(0); // write dummy data
            $cfsFile->writeString($fileName);
        }

        foreach ($this->_files as $fileName) {
            // Get actual data offset
            $dataOffset = $cfsFile->tell();
            // Seek to the data offset pointer
            $cfsFile->seek($dataOffsetPointers[$fileName]);
            // Write actual data offset value
            $cfsFile->writeLong($dataOffset);
            // Seek back to the end of file
            $cfsFile->seek($dataOffset);

            $dataFile = $this->_directory->getFileObject($fileName);

            $byteCount = $this->_directory->fileLength($fileName);
            while ($byteCount > 0) {
                $data = $dataFile->readBytes(min($byteCount, 131072 /*128Kb*/));
                $byteCount -= strlen($data);
                $cfsFile->writeBytes($data);
            }

            $this->_directory->deleteFile($fileName);
        }
    }


    /**
     * Close segment, write it to disk and return segment info
     *
     * @return Zend_Search_Lucene_Index_SegmentInfo
     */
    abstract public function close();
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?