segmentwriter.php
来自「PHP 知识管理系统(基于树结构的知识管理系统), 英文原版的PHP源码。」· PHP 代码 · 共 601 行 · 第 1/2 页
PHP
601 行
private $_frqFile = null;
/**
* Positions file (.prx); created by initializeDictionaryFiles() and
* appended to by addTerm()
*
* @var Zend_Search_Lucene_Storage_File
*/
private $_prxFile = null;
/**
* Number of written terms.
* Reset by initializeDictionaryFiles(), incremented once per addTerm() call
* and stored in the .tis header by closeDictionaryFiles()
*
* @var integer
*/
private $_termCount;
/**
* Last term saved to the .tis file; the next term is prefix-compressed
* against it (null until the first term is written)
*
* @var Zend_Search_Lucene_Index_Term
*/
private $_prevTerm;
/**
* Term info of the last term saved to the .tis file; the next entry's
* freq/prox pointers are written as deltas against it
*
* @var Zend_Search_Lucene_Index_TermInfo
*/
private $_prevTermInfo;
/**
* Last term saved to the .tii (index) file
*
* @var Zend_Search_Lucene_Index_Term
*/
private $_prevIndexTerm;
/**
* Term info of the last term saved to the .tii (index) file
*
* @var Zend_Search_Lucene_Index_TermInfo
*/
private $_prevIndexTermInfo;
/**
* Position in the .tis file recorded when the most recent .tii entry was
* written (initialized to 20); each .tii entry stores the delta from it
*
* @var integer
*/
private $_lastIndexPosition;
/**
 * Create the term dictionary (.tis), dictionary index (.tii), frequency (.frq)
 * and positions (.prx) files, write the fixed headers and reset the
 * per-segment dictionary state.
 */
public function initializeDictionaryFiles()
{
    $segmentName = $this->_name;

    // NOTE(review): 20 is presumably the size of the .tis header written below
    // (int + long + int + int) - confirm against the storage layer.
    $firstTermPosition = 20;

    // Term dictionary header
    $tis = $this->_tisFile = $this->_directory->createFile($segmentName . '.tis');
    $tis->writeInt((int)0xFFFFFFFE);
    $tis->writeLong(0);                          // placeholder for the terms count
    $tis->writeInt(self::$indexInterval);
    $tis->writeInt(self::$skipInterval);

    // Dictionary index header (same layout as the .tis one)
    $tii = $this->_tiiFile = $this->_directory->createFile($segmentName . '.tii');
    $tii->writeInt((int)0xFFFFFFFE);
    $tii->writeLong(0);                          // placeholder for the terms count
    $tii->writeInt(self::$indexInterval);
    $tii->writeInt(self::$skipInterval);

    // Leading special index entry: an empty term
    $tii->writeVInt(0);                          // prefix length
    $tii->writeString('');                       // suffix
    $tii->writeInt((int)0xFFFFFFFF);             // field number
    $tii->writeByte((int)0x0F);
    $tii->writeVInt(0);                          // DocFreq
    $tii->writeVInt(0);                          // FreqDelta
    $tii->writeVInt(0);                          // ProxDelta
    $tii->writeVInt($firstTermPosition);         // IndexDelta

    $this->_frqFile = $this->_directory->createFile($segmentName . '.frq');
    $this->_prxFile = $this->_directory->createFile($segmentName . '.prx');

    // Register the new files as part of the segment
    foreach (array('.tis', '.tii', '.frq', '.prx') as $extension) {
        $this->_files[] = $segmentName . $extension;
    }

    // Nothing has been written yet
    $this->_prevTerm          = null;
    $this->_prevTermInfo      = null;
    $this->_prevIndexTerm     = null;
    $this->_prevIndexTermInfo = null;

    $this->_lastIndexPosition = $firstTermPosition;
    $this->_termCount         = 0;
}
/**
 * Add term
 *
 * Writes the postings of one term to the frequency and positions files, then
 * records the term in the dictionary and, for every indexInterval-th term,
 * in the dictionary index as well.
 *
 * $termDocs is an array( docId => array(pos1, pos2, pos3, ...), ... )
 *
 * @param Zend_Search_Lucene_Index_Term $termEntry
 * @param array $termDocs
 */
public function addTerm($termEntry, $termDocs)
{
    $frqStart = $this->_frqFile->tell();
    $prxStart = $this->_prxFile->tell();

    $lastDocId = 0;
    foreach ($termDocs as $docId => $positions) {
        // The doc delta is doubled; an odd value means no explicit
        // frequency follows it.
        $doubledDelta = ($docId - $lastDocId)*2;
        $lastDocId    = $docId;

        $occurrences = count($positions);
        if ($occurrences > 1) {
            $this->_frqFile->writeVInt($doubledDelta);
            $this->_frqFile->writeVInt($occurrences);
        } else {
            $this->_frqFile->writeVInt($doubledDelta + 1);
        }

        // Positions are stored as deltas from the previous position
        $lastPosition = 0;
        foreach ($positions as $position) {
            $this->_prxFile->writeVInt($position - $lastPosition);
            $lastPosition = $position;
        }
    }

    $docFreq = count($termDocs);

    /**
     * @todo Write Skip Data to a freq file.
     * It's not used now, but make index more optimal
     */
    $skipOffset = ($docFreq >= self::$skipInterval)
                ? $this->_frqFile->tell() - $frqStart
                : 0;

    // The dictionary stores the field number rather than the field name
    $fieldNumber = $this->_fields[$termEntry->field]->number;
    $term        = new Zend_Search_Lucene_Index_Term($termEntry->text, $fieldNumber);
    $termInfo    = new Zend_Search_Lucene_Index_TermInfo($docFreq, $frqStart, $prxStart, $skipOffset);

    $this->_dumpTermDictEntry($this->_tisFile, $this->_prevTerm, $term, $this->_prevTermInfo, $termInfo);

    $writtenTerms = $this->_termCount + 1;
    if ($writtenTerms % self::$indexInterval == 0) {
        // Every indexInterval-th term also gets an index entry, followed by
        // the distance in the .tis file since the previous index entry.
        $this->_dumpTermDictEntry($this->_tiiFile, $this->_prevIndexTerm, $term, $this->_prevIndexTermInfo, $termInfo);

        $tisPosition = $this->_tisFile->tell();
        $this->_tiiFile->writeVInt($tisPosition - $this->_lastIndexPosition);
        $this->_lastIndexPosition = $tisPosition;
    }

    $this->_termCount = $writtenTerms;
}
/**
 * Close dictionary
 *
 * Back-patches the term counts that initializeDictionaryFiles() wrote as
 * placeholders at offset 4 of the .tis and .tii files.
 */
public function closeDictionaryFiles()
{
    $this->_tisFile->seek(4);
    $this->_tisFile->writeLong($this->_termCount);

    // The .tii file holds one leading special entry (the empty term written by
    // initializeDictionaryFiles()) plus one entry for every indexInterval-th
    // term written by addTerm(), i.e. floor(termCount / indexInterval) + 1.
    // The subtraction makes the division exact, so the result stays an integer.
    $indexTermCount = ($this->_termCount - $this->_termCount % self::$indexInterval)
                    / self::$indexInterval
                    + 1;

    $this->_tiiFile->seek(4);
    $this->_tiiFile->writeLong($indexTermCount);
}
/**
* Dump Term Dictionary segment file entry.
* Used to write entry to .tis or .tii files
*
* Fields are written in this order: prefix length (number of leading UTF-8
* characters shared with the previous term of the same field), suffix string,
* field number, DocFreq, FreqDelta, ProxDelta and - only when it is non-zero -
* SkipOffset.
*
* $prevTerm and $prevTermInfo are taken by reference and are overwritten with
* $term and $termInfo, so that the next call encodes against this entry.
*
* @param Zend_Search_Lucene_Storage_File $dicFile
* @param Zend_Search_Lucene_Index_Term $prevTerm  previous term written to $dicFile, or null (updated)
* @param Zend_Search_Lucene_Index_Term $term
* @param Zend_Search_Lucene_Index_TermInfo $prevTermInfo  previous term info written to $dicFile, or null (updated)
* @param Zend_Search_Lucene_Index_TermInfo $termInfo
*/
protected function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile,
&$prevTerm, Zend_Search_Lucene_Index_Term $term,
&$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo)
{
// Prefix compression is only applied against a previous term of the same field
if (isset($prevTerm) && $prevTerm->field == $term->field) {
// Count the leading bytes shared by both terms
$matchedBytes = 0;
$maxBytes = min(strlen($prevTerm->text), strlen($term->text));
while ($matchedBytes < $maxBytes &&
$prevTerm->text[$matchedBytes] == $term->text[$matchedBytes]) {
$matchedBytes++;
}
// Trim the byte match back to whole UTF-8 characters and count them:
// the prefix length is written in characters, the suffix is cut in bytes
$prefixBytes = 0;
$prefixChars = 0;
while ($prefixBytes < $matchedBytes) {
// Sequence length from the lead byte: 11xxxxxx => at least 2 bytes,
// bit 0x20 set => 3 bytes, bit 0x10 also set => 4 bytes
$charBytes = 1;
if ((ord($term->text[$prefixBytes]) & 0xC0) == 0xC0) {
$charBytes++;
if (ord($term->text[$prefixBytes]) & 0x20 ) {
$charBytes++;
if (ord($term->text[$prefixBytes]) & 0x10 ) {
$charBytes++;
}
}
}
if ($prefixBytes + $charBytes > $matchedBytes) {
// char crosses matched bytes boundary
// skip char
break;
}
$prefixChars++;
$prefixBytes += $charBytes;
}
// Write prefix length
$dicFile->writeVInt($prefixChars);
// Write suffix
$dicFile->writeString(substr($term->text, $prefixBytes));
} else {
// First term, or first term of a new field: no shared prefix
// Write prefix length
$dicFile->writeVInt(0);
// Write suffix
$dicFile->writeString($term->text);
}
// Write field number
$dicFile->writeVInt($term->field);
// DocFreq (the count of documents which contain the term)
$dicFile->writeVInt($termInfo->docFreq);
$prevTerm = $term;
if (!isset($prevTermInfo)) {
// No previous entry: the pointers themselves are written
// Write FreqDelta
$dicFile->writeVInt($termInfo->freqPointer);
// Write ProxDelta
$dicFile->writeVInt($termInfo->proxPointer);
} else {
// Write FreqDelta
$dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer);
// Write ProxDelta
$dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer);
}
// Write SkipOffset - addTerm() makes it non-zero only when
// $termInfo->docFreq >= self::$skipInterval; a zero value is not written at all
if ($termInfo->skipOffset != 0) {
$dicFile->writeVInt($termInfo->skipOffset);
}
$prevTermInfo = $termInfo;
}
/**
 * Generate compound index file
 *
 * Packs every file listed in $this->_files into "<segment name>.cfs": first a
 * directory of (data offset, file name) records, then the contents of each
 * file in the same order. Each source file is deleted once it has been copied.
 */
protected function _generateCFS()
{
    $compound = $this->_directory->createFile($this->_name . '.cfs');
    $compound->writeVInt(count($this->_files));

    // Directory pass: remember where each offset slot lives and fill it
    // with a placeholder until the real data position is known.
    $offsetSlots = array();
    foreach ($this->_files as $memberName) {
        $offsetSlots[$memberName] = $compound->tell();
        $compound->writeLong(0);
        $compound->writeString($memberName);
    }

    // Data pass
    foreach ($this->_files as $memberName) {
        // The member's data starts at the current end of the compound file;
        // patch that position into its directory slot, then come back.
        $dataStart = $compound->tell();
        $compound->seek($offsetSlots[$memberName]);
        $compound->writeLong($dataStart);
        $compound->seek($dataStart);

        $member = $this->_directory->getFileObject($memberName);

        // Copy in chunks of at most 128Kb
        for ($remaining = $this->_directory->fileLength($memberName);
             $remaining > 0;
             $remaining -= strlen($chunk)) {
            $chunk = $member->readBytes(min($remaining, 131072));
            $compound->writeBytes($chunk);
        }

        $this->_directory->deleteFile($memberName);
    }
}
/**
* Close segment, write it to disk and return segment info
*
* Declared abstract: each concrete segment writer supplies the implementation.
*
* @return Zend_Search_Lucene_Index_SegmentInfo
*/
abstract public function close();
}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?