segmentinfo.php
来自「PHP 知识管理系统(基于树结构的知识管理系统), 英文原版的PHP源码。」· PHP 代码 · 共 972 行 · 第 1/2 页
PHP
972 行
// search for appropriate value in dictionary
$lowIndex = 0;
$highIndex = count($this->_termDictionary)-1;
while ($highIndex >= $lowIndex) {
// $mid = ($highIndex - $lowIndex)/2;
$mid = ($highIndex + $lowIndex) >> 1;
$midTerm = $this->_termDictionary[$mid];
$fieldNum = $this->_getFieldPosition($midTerm[0] /* field */);
$delta = $searchDicField - $fieldNum;
if ($delta == 0) {
$delta = strcmp($term->text, $midTerm[1] /* text */);
}
if ($delta < 0) {
$highIndex = $mid-1;
} elseif ($delta > 0) {
$lowIndex = $mid+1;
} else {
// return $this->_termDictionaryInfos[$mid]; // We got it!
$a = $this->_termDictionaryInfos[$mid];
$termInfo = new Zend_Search_Lucene_Index_TermInfo($a[0], $a[1], $a[2], $a[3], $a[4]);
// Put loaded termInfo into cache
$this->_termInfoCache[$termKey] = $termInfo;
return $termInfo;
}
}
if ($highIndex == -1) {
// Term is out of the dictionary range
return null;
}
$prevPosition = $highIndex;
$prevTerm = $this->_termDictionary[$prevPosition];
$prevTermInfo = $this->_termDictionaryInfos[$prevPosition];
$tisFile = $this->openCompoundFile('.tis');
$tiVersion = $tisFile->readInt();
if ($tiVersion != (int)0xFFFFFFFE) {
throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
}
$termCount = $tisFile->readLong();
$indexInterval = $tisFile->readInt();
$skipInterval = $tisFile->readInt();
$tisFile->seek($prevTermInfo[4] /* indexPointer */ - 20 /* header size*/, SEEK_CUR);
$termValue = $prevTerm[1] /* text */;
$termFieldNum = $prevTerm[0] /* field */;
$freqPointer = $prevTermInfo[1] /* freqPointer */;
$proxPointer = $prevTermInfo[2] /* proxPointer */;
for ($count = $prevPosition*$indexInterval + 1;
$count <= $termCount &&
( $this->_getFieldPosition($termFieldNum) < $searchDicField ||
($this->_getFieldPosition($termFieldNum) == $searchDicField &&
strcmp($termValue, $term->text) < 0) );
$count++) {
$termPrefixLength = $tisFile->readVInt();
$termSuffix = $tisFile->readString();
$termFieldNum = $tisFile->readVInt();
$termValue = Zend_Search_Lucene_Index_Term::getPrefix($termValue, $termPrefixLength) . $termSuffix;
$docFreq = $tisFile->readVInt();
$freqPointer += $tisFile->readVInt();
$proxPointer += $tisFile->readVInt();
if( $docFreq >= $skipInterval ) {
$skipOffset = $tisFile->readVInt();
} else {
$skipOffset = 0;
}
}
if ($termFieldNum == $searchField && $termValue == $term->text) {
$termInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
} else {
$termInfo = null;
}
// Put loaded termInfo into cache
$this->_termInfoCache[$termKey] = $termInfo;
if (count($this->_termInfoCache) == 1024) {
$this->_cleanUpTermInfoCache();
}
return $termInfo;
}
/**
* Load normalization factors from an index file
*
* @param integer $fieldNum
*/
private function _loadNorm($fieldNum)
{
    // The norms of a field are read from the segment sub-file '.f<fieldNum>'.
    $normsFileExtension = '.f' . $fieldNum;
    $normsFile          = $this->openCompoundFile($normsFileExtension);

    // Read $_docCount bytes and keep them as one byte string;
    // norm() later indexes this string by document id.
    $this->_norms[$fieldNum] = $normsFile->readBytes($this->_docCount);
}
/**
* Returns the normalization factor of a field for the specified document
*
* @param integer $id
* @param string $fieldName
* @return float|null
*/
public function norm($id, $fieldName)
{
    // Returns the decoded norm of field $fieldName for document $id,
    // or null when the field is unknown or not indexed.
    $fieldNum = $this->getFieldNum($fieldName);

    // normVector() treats a field number of -1 as "field not found"; apply the
    // same guard here so an unknown field yields null without touching
    // $this->_fields[-1].
    if ($fieldNum == -1 || !($this->_fields[$fieldNum]->isIndexed)) {
        return null;
    }

    // Norms are loaded lazily, once per field.
    if (!isset($this->_norms[$fieldNum])) {
        $this->_loadNorm($fieldNum);
    }

    // Use square brackets for the string offset: the curly-brace form
    // ($str{$i}) is deprecated since PHP 7.4 and a fatal error in PHP 8.0.
    $encodedNorm = ord($this->_norms[$fieldNum][$id]);

    return Zend_Search_Lucene_Search_Similarity::decodeNorm($encodedNorm);
}
/**
* Returns norm vector, encoded in a byte string
*
* @param string $fieldName
* @return string
*/
public function normVector($fieldName)
{
    // Returns the norms of $fieldName as a byte string, one byte per document.
    $fieldNum = $this->getFieldNum($fieldName);

    $hasStoredNorms = ($fieldNum != -1) && $this->_fields[$fieldNum]->isIndexed;

    if ($hasStoredNorms) {
        // Load the norms on first use, then serve them from memory.
        if (!isset($this->_norms[$fieldNum])) {
            $this->_loadNorm($fieldNum);
        }

        return $this->_norms[$fieldNum];
    }

    // Unknown or non-indexed field: build a vector that repeats the encoded
    // default norm (length norm for zero tokens) once per document.
    $similarity  = Zend_Search_Lucene_Search_Similarity::getDefault();
    $defaultNorm = $similarity->encodeNorm($similarity->lengthNorm($fieldName, 0));

    return str_repeat(chr($defaultNorm), $this->_docCount);
}
/**
* Returns true if any documents have been deleted from this index segment.
*
* @return boolean
*/
public function hasDeletions()
{
    // A null $_deleted means this segment holds no deletion set at all.
    return !($this->_deleted === null);
}
/**
* Deletes a document from the index segment.
* $id is an internal document id
*
* @param integer $id
*/
public function delete($id)
{
    // Remember that the deletion set must be written out by writeChanges().
    $this->_deletedDirty = true;

    if (!extension_loaded('bitset')) {
        // Fallback storage: a PHP array keyed by the deleted document id.
        if ($this->_deleted === null) {
            $this->_deleted = array();
        }
        $this->_deleted[$id] = 1;

        return;
    }

    // The bitset extension is available: keep deletions as a bitset.
    if ($this->_deleted === null) {
        $this->_deleted = bitset_empty($id);
    }
    bitset_incl($this->_deleted, $id);
}
/**
* Checks whether a document is deleted
*
* @param integer $id
* @return boolean
*/
public function isDeleted($id)
{
    // No deletion set at all: nothing can be deleted.
    if ($this->_deleted === null) {
        return false;
    }

    // The storage format mirrors delete(): a bitset when the extension is
    // loaded, otherwise an array keyed by document id.
    return extension_loaded('bitset')
        ? bitset_in($this->_deleted, $id)
        : isset($this->_deleted[$id]);
}
/**
* Write changes if it's necessary.
*/
public function writeChanges()
{
    // Persists the deletion set to the segment's '.del' file.
    // Does nothing when no document was deleted since the last write.
    if (!$this->_deletedDirty) {
        return;
    }

    if (extension_loaded('bitset')) {
        // The bitset is already a packed byte string.
        $delBytes = $this->_deleted;
        $bitCount = count(bitset_to_array($delBytes));
    } else {
        // Pack the array of deleted ids into a bit vector, 8 documents per
        // byte, least significant bit first. floor() returns a float, so cast
        // it to get an integer byte count for str_repeat() and the offsets.
        $byteCount = (int) floor($this->_docCount / 8) + 1;
        $delBytes  = str_repeat(chr(0), $byteCount);

        for ($count = 0; $count < $byteCount; $count++) {
            $byte = 0;
            for ($bit = 0; $bit < 8; $bit++) {
                if (isset($this->_deleted[$count * 8 + $bit])) {
                    $byte |= (1 << $bit);
                }
            }
            // Square-bracket string offset: the curly-brace form ($str{$i})
            // is deprecated since PHP 7.4 and a fatal error in PHP 8.0.
            $delBytes[$count] = chr($byte);
        }

        $bitCount = count($this->_deleted);
    }

    // File layout: document count, number of set bits, then the bit vector.
    $delFile = $this->_directory->createFile($this->_name . '.del');
    $delFile->writeInt($this->_docCount);
    $delFile->writeInt($bitCount);
    $delFile->writeBytes($delBytes);

    $this->_deletedDirty = false;
}
/**
 * Term dictionary ('.tis') file object used for stream-like term reading.
 * Opened by reset(); set back to null by nextTerm() once no terms remain.
 *
 * @var Zend_Search_Lucene_Storage_File|null
 */
private $_tisFile = null;
/**
 * Frequencies ('.frq') file object used for stream-like term reading.
 * Opened by reset(); set back to null by nextTerm() once no terms remain.
 *
 * @var Zend_Search_Lucene_Storage_File|null
 */
private $_frqFile = null;
/**
 * Position reported by the .frq file object right after it is opened in
 * reset(). nextTerm() adds it to a term's freqPointer before seeking.
 *
 * @var integer
 */
private $_frqFileOffset;
/**
 * Positions ('.prx') file object used for stream-like term reading.
 * Opened by reset(); set back to null by nextTerm() once no terms remain.
 *
 * @var Zend_Search_Lucene_Storage_File|null
 */
private $_prxFile = null;
/**
 * Position reported by the .prx file object right after it is opened in
 * reset(). nextTerm() adds it to a term's proxPointer before seeking.
 *
 * @var integer
 */
private $_prxFileOffset;
/**
 * Number of terms still to be read from the term stream.
 * Initialised from the .tis header in reset() and decremented by nextTerm().
 *
 * @var integer
 */
private $_termCount = 0;
/**
 * Segment skip interval, read from the .tis header in reset().
 * A term entry carries a skip offset only when its document frequency
 * is at least this value.
 *
 * @var integer
 */
private $_skipInterval;
/**
 * TermInfo of the last term read from the term stream
 *
 * @var Zend_Search_Lucene_Index_TermInfo|null
 */
private $_lastTermInfo = null;
/**
 * Last term read from the term stream
 *
 * @var Zend_Search_Lucene_Index_Term|null
 */
private $_lastTerm = null;
/**
 * Map of document ids: array( segment docId => new docId, ... ).
 * Built by reset(); deleted documents have no entry.
 * Used to get the new docId after removing deleted documents.
 * It is not very efficient in terms of memory usage,
 * but it is much faster than other methods.
 *
 * @var array|null
 */
private $_docMap = null;
/**
 * All positions of the last read term, grouped by document.
 * Array structure: array( docId => array( pos1, pos2, ...), ...)
 * where docId is the mapped id taken from $_docMap.
 *
 * @var array
 */
private $_lastTermPositions;
/**
* Reset terms stream
*
* $startId - id for the first document
* $compact - remove deleted documents
*
* Returns start document id for the next segment
*
* @param integer $startId
* @param boolean $compact
* @throws Zend_Search_Lucene_Exception
* @return integer
*/
public function reset($startId = 0, $compact = false)
{
    // --- Term dictionary (.tis): drop any previous stream, reopen, read header
    $this->_tisFile = null;
    $this->_tisFile = $this->openCompoundFile('.tis', false);

    $formatVersion = $this->_tisFile->readInt();
    if ($formatVersion != (int)0xFFFFFFFE) {
        throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
    }

    $this->_termCount = $this->_tisFile->readLong();
    $this->_tisFile->readInt();                          // index interval, value not used here
    $this->_skipInterval = $this->_tisFile->readInt();   // skip interval

    // --- Frequencies (.frq): remember where the sub-file starts
    $this->_frqFile = null;
    $this->_frqFile = $this->openCompoundFile('.frq', false);
    $this->_frqFileOffset = $this->_frqFile->tell();

    // --- Positions (.prx): remember where the sub-file starts
    $this->_prxFile = null;
    $this->_prxFile = $this->openCompoundFile('.prx', false);
    $this->_prxFileOffset = $this->_prxFile->tell();

    // Seed values that nextTerm() decodes the first term entry against
    $this->_lastTerm     = new Zend_Search_Lucene_Index_Term('', -1);
    $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo(0, 0, 0, 0);

    // Build the id map for all non-deleted documents. When compacting, ids
    // are assigned densely; otherwise each document keeps its offset.
    $this->_docMap = array();
    $liveDocs = 0;
    for ($docId = 0; $docId < $this->_docCount; $docId++) {
        if ($this->isDeleted($docId)) {
            continue;
        }

        $this->_docMap[$docId] = $startId + ($compact ? $liveDocs : $docId);
        $liveDocs++;
    }

    // Position the stream on the first term
    $this->nextTerm();

    // Start id for the next segment
    if ($compact) {
        return $startId + $liveDocs;
    }

    return $startId + $this->_docCount;
}
/**
* Scans terms dictionary and returns next term
*
* @return Zend_Search_Lucene_Index_Term|null
*/
public function nextTerm()
{
    if ($this->_tisFile === null || $this->_termCount == 0) {
        // Stream not opened or exhausted: clear the current term state.
        $this->_lastTerm     = null;
        $this->_lastTermInfo = null;

        // Releasing the file objects may be necessary for an "empty" segment
        $this->_tisFile = null;
        $this->_frqFile = null;
        $this->_prxFile = null;

        return null;
    }

    // Term entry: length of the prefix shared with the previous term,
    // the remaining suffix, and the field number.
    $prefixLength = $this->_tisFile->readVInt();
    $suffix       = $this->_tisFile->readString();
    $fieldNum     = $this->_tisFile->readVInt();
    $text         = Zend_Search_Lucene_Index_Term::getPrefix($this->_lastTerm->text, $prefixLength) . $suffix;

    $this->_lastTerm = new Zend_Search_Lucene_Index_Term($text, $this->_fields[$fieldNum]->name);

    // Term info: both pointers are stored as deltas against the previous term;
    // a skip offset is present only for sufficiently frequent terms.
    $docFreq     = $this->_tisFile->readVInt();
    $freqPointer = $this->_lastTermInfo->freqPointer + $this->_tisFile->readVInt();
    $proxPointer = $this->_lastTermInfo->proxPointer + $this->_tisFile->readVInt();
    $skipOffset  = ($docFreq >= $this->_skipInterval) ? $this->_tisFile->readVInt() : 0;

    $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);

    $this->_lastTermPositions = array();

    // Read the per-document frequencies of this term from the .frq data.
    $this->_frqFile->seek($this->_lastTermInfo->freqPointer + $this->_frqFileOffset, SEEK_SET);
    $termFreqs = array();
    $docId     = 0;
    for ($entry = 0; $entry < $this->_lastTermInfo->docFreq; $entry++) {
        $docDelta = $this->_frqFile->readVInt();
        if ($docDelta % 2 == 1) {
            // Odd value: the frequency is 1 and is not stored separately
            $docId += ($docDelta - 1) / 2;
            $termFreqs[$docId] = 1;
        } else {
            // Even value: the frequency follows as its own VInt
            $docId += $docDelta / 2;
            $termFreqs[$docId] = $this->_frqFile->readVInt();
        }
    }

    // Read the positions from the .prx data. They are delta-encoded within
    // each document and must be consumed even for documents that are dropped.
    $this->_prxFile->seek($this->_lastTermInfo->proxPointer + $this->_prxFileOffset, SEEK_SET);
    foreach ($termFreqs as $segmentDocId => $freq) {
        $positions = array();
        $position  = 0;
        for ($occurrence = 0; $occurrence < $freq; $occurrence++) {
            $position   += $this->_prxFile->readVInt();
            $positions[] = $position;
        }

        // Documents without a $_docMap entry are not reported
        if (!isset($this->_docMap[$segmentDocId])) {
            continue;
        }
        $this->_lastTermPositions[$this->_docMap[$segmentDocId]] = $positions;
    }

    // One term consumed; release the file objects after the last one.
    if (--$this->_termCount == 0) {
        $this->_tisFile = null;
        $this->_frqFile = null;
        $this->_prxFile = null;
    }

    return $this->_lastTerm;
}
/**
* Returns the term at the current position of the term stream
*
* @return Zend_Search_Lucene_Index_Term|null
*/
public function currentTerm()
{
    // The term most recently produced by nextTerm(); nextTerm() sets it to
    // null once the stream has no more terms.
    $current = $this->_lastTerm;

    return $current;
}
/**
* Returns an array of all term positions in the documents.
* Return array structure: array( docId => array( pos1, pos2, ...), ...)
*
* @return array
*/
public function currentTermPositions()
{
    // Positions collected by the last nextTerm() call, keyed by document id
    $positionsByDoc = $this->_lastTermPositions;

    return $positionsByDoc;
}
}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?