📄 phrase.php
字号:
/**
 * Score calculator for exact phrase queries (terms sequence is fixed)
 *
 * Selects the query term with the fewest positions in the document and, for
 * each of those positions, checks that every other term occurs at the position
 * implied by the difference between the two terms' offsets.
 *
 * @param integer $docId
 * @return float  number of positions at which the whole phrase was found
 */
public function _exactPhraseFreq($docId)
{
    $freq = 0;

    // Term Id with lowest cardinality
    $lowCardTermId = null;

    // Calculate $lowCardTermId
    foreach ($this->_terms as $termId => $term) {
        if ($lowCardTermId === null ||
            count($this->_termsPositions[$termId][$docId]) <
            count($this->_termsPositions[$lowCardTermId][$docId])) {
            $lowCardTermId = $termId;
        }
    }

    if ($lowCardTermId === null) {
        // Query has no terms, so there is nothing to look for.
        return $freq;
    }

    $lowCardOffset = $this->_offsets[$lowCardTermId];

    // Walk through positions of the term with lowest cardinality
    foreach ($this->_termsPositions[$lowCardTermId][$docId] as $lowCardPos) {
        // We expect phrase to be found
        $phraseFound = true;

        // Walk through other terms
        foreach ($this->_terms as $termId => $term) {
            // Both values are keys of the same array, so identity comparison is exact.
            if ($termId === $lowCardTermId) {
                continue;
            }

            $expectedPosition = $lowCardPos + ($this->_offsets[$termId] - $lowCardOffset);

            // Loose comparison is kept as in the original: the element type
            // produced by termPositions() is not visible in this excerpt.
            if (!in_array($expectedPosition, $this->_termsPositions[$termId][$docId])) {
                // Phrase wasn't found.
                $phraseFound = false;
                break;
            }
        }

        if ($phraseFound) {
            $freq++;
        }
    }

    return $freq;
}

/**
 * Score calculator for sloppy phrase queries (terms sequence is fixed)
 *
 * Builds candidate phrases (one position per term) from the term positions in
 * the document, then for each candidate takes the smallest total displacement
 * over all start shifts in [-slop, slop]. Candidates whose smallest
 * displacement does not exceed the slop contribute
 * sloppyFreq(displacement) to the result.
 *
 * @param integer $docId
 * @param Zend_Search_Lucene_Interface $reader
 * @return float
 */
public function _sloppyPhraseFreq($docId, Zend_Search_Lucene_Interface $reader)
{
    $freq = 0;

    if (count($this->_terms) == 0) {
        // An empty query matches nothing (execute() yields an empty result
        // vector for it), so the initial empty phrase must not be scored.
        return $freq;
    }

    $phraseQueue    = array();
    $phraseQueue[0] = array(); // empty phrase
    $lastTerm       = null;

    // Walk through the terms to create phrases.
    foreach ($this->_terms as $termId => $term) {
        $queueSize = count($phraseQueue);
        $firstPass = true;

        // Walk through the term positions.
        // Each term position produces a set of phrases.
        foreach ($this->_termsPositions[$termId][$docId] as $termPosition) {
            if ($firstPass) {
                // The first position extends every phrase already queued, in place.
                for ($count = 0; $count < $queueSize; $count++) {
                    $phraseQueue[$count][$termId] = $termPosition;
                }
            } else {
                // Every further position forks the phrases queued before this
                // term, skipping forks that are already further than the slop
                // from the previous term.
                for ($count = 0; $count < $queueSize; $count++) {
                    if ($lastTerm !== null &&
                        abs($termPosition - $phraseQueue[$count][$lastTerm] -
                            ($this->_offsets[$termId] - $this->_offsets[$lastTerm])) > $this->_slop) {
                        continue;
                    }

                    $newPhraseId = count($phraseQueue);
                    $phraseQueue[$newPhraseId]          = $phraseQueue[$count];
                    $phraseQueue[$newPhraseId][$termId] = $termPosition;
                }
            }

            $firstPass = false;
        }

        $lastTerm = $termId;
    }

    // Loop invariant: offset of the first query term.
    $firstOffset = reset($this->_offsets);

    foreach ($phraseQueue as $phrasePos) {
        $minDistance   = null;
        $firstPosition = reset($phrasePos);

        for ($shift = -$this->_slop; $shift <= $this->_slop; $shift++) {
            $distance = 0;
            $start    = $firstPosition - $firstOffset + $shift;

            foreach ($this->_terms as $termId => $term) {
                $distance += abs($phrasePos[$termId] - $this->_offsets[$termId] - $start);

                if ($distance > $this->_slop) {
                    break;
                }
            }

            if ($minDistance === null || $distance < $minDistance) {
                $minDistance = $distance;
            }
        }

        // $minDistance stays null when the shift loop never ran (negative slop);
        // such a candidate must not be scored.
        if ($minDistance !== null && $minDistance <= $this->_slop) {
            $freq += $reader->getSimilarity()->sloppyFreq($minDistance);
        }
    }

    return $freq;
}

/**
 * Execute query in context of index reader
 * It also initializes necessary internal structures
 *
 * Fills $this->_termsPositions for every term and sets $this->_resVector to
 * the documents that contain all query terms.
 *
 * @param Zend_Search_Lucene_Interface $reader
 */
public function execute(Zend_Search_Lucene_Interface $reader)
{
    $this->_resVector = null;

    if (count($this->_terms) == 0) {
        $this->_resVector = array();
    }

    $resVectors      = array();
    $resVectorsSizes = array();
    $resVectorsIds   = array(); // is used to prevent arrays comparison
    foreach ($this->_terms as $termId => $term) {
        $resVectors[]      = array_flip($reader->termDocs($term));
        $resVectorsSizes[] = count(end($resVectors));
        $resVectorsIds[]   = $termId;

        $this->_termsPositions[$termId] = $reader->termPositions($term);
    }

    // sort resvectors in order of subquery cardinality increasing
    array_multisort($resVectorsSizes, SORT_ASC, SORT_NUMERIC,
                    $resVectorsIds,   SORT_ASC, SORT_NUMERIC,
                    $resVectors);

    foreach ($resVectors as $nextResVector) {
        if ($this->_resVector === null) {
            $this->_resVector = $nextResVector;
        } else {
            //$this->_resVector = array_intersect_key($this->_resVector, $nextResVector);

            /**
             * This code is used as workaround for array_intersect_key() slowness problem.
             */
            $updatedVector = array();
            foreach ($this->_resVector as $id => $value) {
                if (isset($nextResVector[$id])) {
                    $updatedVector[$id] = $value;
                }
            }
            $this->_resVector = $updatedVector;
        }

        if (count($this->_resVector) == 0) {
            // Empty result set, we don't need to check other terms
            break;
        }
    }

    // ksort($this->_resVector, SORT_NUMERIC);
    // Docs are returned ordered. Used algorithm doesn't change elements order.

    // Initialize weight if it's not done yet
    $this->_initWeight($reader);
}

/**
 * Get document ids likely matching the query
 *
 * It's an array with document ids as keys (performance considerations)
 *
 * @return array
 */
public function matchedDocs()
{
    return $this->_resVector;
}

/**
 * Score specified document
 *
 * Returns 0 for documents that are not in the result vector or in which the
 * phrase frequency is zero; otherwise tf * weight * norm * boost.
 *
 * @param integer $docId
 * @param Zend_Search_Lucene_Interface $reader
 * @return float
 */
public function score($docId, Zend_Search_Lucene_Interface $reader)
{
    if (!isset($this->_resVector[$docId])) {
        return 0;
    }

    if ($this->_slop == 0) {
        $freq = $this->_exactPhraseFreq($docId);
    } else {
        $freq = $this->_sloppyPhraseFreq($docId, $reader);
    }

    if ($freq == 0) {
        // Included in result, but calculated freq is zero
        return 0;
    }

    $tf     = $reader->getSimilarity()->tf($freq);
    $weight = $this->_weight->getValue();
    // The norm is looked up for the field of the first query term.
    $norm   = $reader->norm($docId, reset($this->_terms)->field);

    return $tf * $weight * $norm * $this->getBoost();
}

/**
 * Return query terms
 *
 * @return array
 */
public function getQueryTerms()
{
    return $this->_terms;
}

/**
 * Highlight query terms
 *
 * @param Zend_Search_Lucene_Document_Html $doc
 * @param integer &$colorIndex
 */
public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
{
    $words = array();
    foreach ($this->_terms as $term) {
        $words[] = $term->text;
    }

    $doc->highlight($words, $this->_getHighlightColor($colorIndex));
}

/**
 * Print a query
 *
 * Produces [field:]"term1 term2 ..."[~slop].
 *
 * @return string
 */
public function __toString()
{
    // It's used only for query visualisation, so we don't care about characters escaping

    $query = '';

    if (isset($this->_terms[0]) && $this->_terms[0]->field !== null) {
        $query .= $this->_terms[0]->field . ':';
    }

    $query .= '"';

    foreach ($this->_terms as $id => $term) {
        if ($id != 0) {
            $query .= ' ';
        }
        $query .= $term->text;
    }

    $query .= '"';

    if ($this->_slop != 0) {
        $query .= '~' . $this->_slop;
    }

    return $query;
}
} // closes the enclosing class; its declaration lies outside this excerpt
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -