📄 indexercore.inc.php
字号:
/**
 * Flags the batch that is currently being processed for a restart.
 *
 * indexDocuments() checks this flag for every document in its loop; once
 * it is set, each remaining document is unqueued and queued again
 * instead of being extracted and indexed.
 */
public function restartBatch()
{
    $this->restartCurrentBatch = true;
}
/**
 * Records a status message against a pending document and writes the
 * same message to the system log.
 *
 * @param int $documentId
 * @param string $message
 * @param string $level This may be info, error, debug. It is used as the
 *                      name of the logger method that gets called.
 */
private function logPendingDocumentInfoStatus($documentId, $message, $level)
{
    global $default;

    // The status on the pending document is always updated, whatever the level.
    $this->updatePendingDocumentStatus($documentId, $message, $level);

    if ($level == 'debug')
    {
        // Debug messages only reach the log when the indexer runs in debug mode.
        if ($this->debug)
        {
            $default->log->debug($message);
        }
        return;
    }

    // Any other level is dispatched to the logger method with the same name.
    $default->log->$level($message);
}
/**
 * Loads the include file for an extractor class and returns a new instance.
 *
 * The class is expected to live in
 * SEARCH2_INDEXER_DIR . 'extractors/<extractorClass>.inc.php'.
 *
 * @param string $extractorClass Name of the extractor class to instantiate.
 * @return DocumentExtractor|null Null when $extractorClass is empty.
 * @throws Exception When the include file does not exist, when the file does
 *                   not define the class, or when the class is not a
 *                   DocumentExtractor.
 */
public function getExtractor($extractorClass)
{
    if (empty($extractorClass))
    {
        return null;
    }

    $includeFile = SEARCH2_INDEXER_DIR . 'extractors/' . $extractorClass . '.inc.php';
    if (!file_exists($includeFile))
    {
        throw new Exception("Extractor file does not exist: $includeFile");
    }

    require_once($includeFile);

    if (!class_exists($extractorClass))
    {
        // The message previously interpolated an undefined variable, so the
        // class name never appeared in the error text.
        throw new Exception("Extractor '$extractorClass' not defined in file: $includeFile");
    }

    $extractor = new $extractorClass();
    if (!($extractor instanceof DocumentExtractor))
    {
        throw new Exception("Class $extractorClass was expected to be of type DocumentExtractor");
    }

    return $extractor;
}
/**
 * Returns the entries of the indexing queue (index_files) for documents
 * whose status_id is 1, ordered by indexdate.
 *
 * @param bool $problemItemsOnly When true (the default) only entries with a
 *                               non-empty status_msg are returned; otherwise
 *                               only entries whose status_msg is null or empty.
 * @return mixed Whatever DBUtil::getResultArray() yields for the query.
 */
public static function getIndexingQueue($problemItemsOnly=true)
{
    // The two views of the queue differ only in the status_msg filter.
    if ($problemItemsOnly)
    {
        $statusFilter = "(iff.status_msg IS NOT NULL AND iff.status_msg <> '')";
    }
    else
    {
        $statusFilter = "(iff.status_msg IS NULL or iff.status_msg = '')";
    }

    $queryLines = array(
        "SELECT",
        "iff.document_id, iff.indexdate, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what, iff.status_msg, dcv.filename",
        "FROM",
        "index_files iff",
        "INNER JOIN documents d ON iff.document_id=d.id",
        "INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id",
        "INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id",
        "INNER JOIN mime_types mt ON dcv.mime_id=mt.id",
        "LEFT JOIN mime_extractors me ON mt.extractor_id=me.id",
        "WHERE",
        $statusFilter . " AND d.status_id=1",
        "ORDER BY indexdate ",
    );

    return DBUtil::getResultArray(implode("\n", $queryLines));
}
/**
 * Returns the queue entries that have no status message, i.e. the result
 * of getIndexingQueue() with $problemItemsOnly set to false.
 *
 * @return mixed The value returned by Indexer::getIndexingQueue(false).
 */
public static function getPendingIndexingQueue()
{
    $pendingItems = Indexer::getIndexingQueue(false);

    return $pendingItems;
}
/**
 * Recomputes the indexer statistics and diagnostics and stores them,
 * serialized, in the system settings 'indexerStats', 'indexerDiagnostics'
 * and 'extractorDiagnostics'.
 *
 * The statistics cover the last optimisation and indexing dates, the number
 * of documents in the index, in the queue, in error and in the repository,
 * and the derived coverage percentages.
 */
public function updateIndexStats()
{
    // Last optimisation: 'N/A' when the setting has never been written.
    $optimisationDate = KTUtil::getSystemSetting('luceneOptimisationDate', '');
    $noOptimisation = false;
    if ($optimisationDate == '')
    {
        $optimisationDate = _kt('N/A');
        $optimisationPeriod = $optimisationDate;
    }
    else
    {
        $optimisationPeriod = KTUtil::computePeriodToDate($optimisationDate, null, true);
        // NOTE(review): presumably 'days' is the time elapsed since the last
        // optimisation - confirm against KTUtil::computePeriodToDate().
        $noOptimisation = $optimisationPeriod['days'] > 2;
        $optimisationPeriod = $optimisationPeriod['str'];
        $optimisationDate = date('Y-m-d H:i:s', $optimisationDate);
    }

    // Last indexing run: same 'N/A' fallback as above.
    $indexingDate = KTUtil::getSystemSetting('luceneIndexingDate', '');
    if ($indexingDate == '')
    {
        $indexingDate = _kt('N/A');
        $indexingPeriod = $indexingDate;
    }
    else
    {
        $indexingPeriod = KTUtil::computePeriodToDate($indexingDate);
        $indexingDate = date('Y-m-d H:i:s', $indexingDate);
    }

    $index = Indexer::get();
    $docsInIndex = $index->getDocumentsInIndex();

    // we are only interested in documents that are active
    $sql = "SELECT count(*) as docsInQueue FROM index_files i inner join documents d on i.document_id = d.id where (i.status_msg is null or i.status_msg = '') and d.status_id=1";
    $docsInQueue = DBUtil::getOneResultKey($sql, 'docsInQueue');

    // An entry is in error only when status_msg is both non-null AND non-empty.
    // With OR, rows holding an empty string were counted here as well as in
    // docsInQueue above. This now matches the filter used by getIndexingQueue().
    $sql = "SELECT count(*) as errorsInQueue FROM index_files i inner join documents d on i.document_id = d.id where (i.status_msg is not null and i.status_msg <> '') and d.status_id=1";
    $errorsInQueue = DBUtil::getOneResultKey($sql, 'errorsInQueue');

    $sql = "SELECT count(*) as docsInRepository FROM documents where status_id=1";
    $docsInRepository = DBUtil::getOneResultKey($sql, 'docsInRepository');

    if ($docsInRepository == 0)
    {
        // Avoid dividing by zero when the repository has no active documents.
        $indexingCoverage = '0.00%';
        $queueCoverage = $indexingCoverage;
    }
    else
    {
        // compute indexing coverage
        $indexingCoverage = _kt('Not Available');
        if (is_numeric($docsInIndex))
        {
            $indexingCoverage = ($docsInIndex * 100) / $docsInRepository;
            $indexingCoverage = number_format($indexingCoverage, 2, '.',',') . '%';
        }

        // compute queue coverage
        $queueCoverage = _kt('Not Available');
        if (is_numeric($docsInQueue))
        {
            $queueCoverage = ($docsInQueue * 100) / $docsInRepository;
            $queueCoverage = number_format($queueCoverage, 2, '.',',') . '%';
        }
    }

    $stats = array(
        'optimisationDate'=>$optimisationDate,
        'optimisationPeriod'=>$optimisationPeriod,
        'indexingDate'=>$indexingDate,
        'indexingPeriod'=>$indexingPeriod,
        'docsInIndex'=>$docsInIndex,
        'docsInQueue'=>$docsInQueue,
        'errorsInQueue'=>$errorsInQueue,
        'docsInRepository'=>$docsInRepository,
        'indexingCoverage'=>$indexingCoverage,
        'queueCoverage'=>$queueCoverage,
        'noOptimisation'=>$noOptimisation
    );
    KTUtil::setSystemSetting('indexerStats', serialize($stats));

    // Store the diagnostics of the indexer and of its extractors as well.
    $indexer = Indexer::get();

    $diagnosis = $indexer->diagnose();
    KTUtil::setSystemSetting('indexerDiagnostics', serialize($diagnosis));

    $extractorDiagnosis = $indexer->diagnoseExtractors();
    KTUtil::setSystemSetting('extractorDiagnostics', serialize($extractorDiagnosis));
}
/**
* The main function that may be called repeatedly to index documents.
*
* @param int $max Default 20
*/
public function indexDocuments($max=null)
{
global $default;
$config =& KTConfig::getSingleton();
/*$indexLockFile = $config->get('cache/cacheDirectory') . '/main.index.lock';
if (is_file($indexLockFile))
{
$default->log->info('indexDocuments: main.index.lock seems to exist. it could be that the indexing is still underway.');
$default->log->info('indexDocuments: Remove "' . $indexLockFile . '" if the indexing is not running or extend the frequency at which the background task runs!');
return;
}
touch($indexLockFile);*/
$this->checkForRegisteredTypes();
if ($this->debug) $default->log->debug('indexDocuments: start');
if (!$this->doesDiagnosticsPass())
{
//unlink($indexLockFile);
if ($this->debug) $default->log->debug('indexDocuments: stopping - diagnostics problem. The dashboard will provide more information.');
return;
}
if (is_null($max))
{
$max = $config->get('indexer/batchDocuments',20);
}
$this->loadExtractorHooks();
Indexer::clearoutDeleted();
$date = date('Y-m-d H:i:s');
// identify the indexers that must run
// mysql specific limit!
$sql = "SELECT
iff.document_id, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what
FROM
index_files iff
INNER JOIN documents d ON iff.document_id=d.id
INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
INNER JOIN mime_types mt ON dcv.mime_id=mt.id
LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
WHERE
(iff.processdate IS NULL or iff.processdate < date_sub('$date', interval 1 day)) AND dmv.status_id=1
ORDER BY indexdate
LIMIT $max";
$result = DBUtil::getResultArray($sql);
if (PEAR::isError($result))
{
//unlink($indexLockFile);
if ($this->debug) $default->log->debug('indexDocuments: stopping - db error');
return;
}
KTUtil::setSystemSetting('luceneIndexingDate', time());
// bail if no work to do
if (count($result) == 0)
{
//unlink($indexLockFile);
if ($this->debug) $default->log->debug('indexDocuments: stopping - no work to be done');
return;
}
// identify any documents that need indexing and mark them
// so they are not taken in a followup run
$ids = array();
foreach($result as $docinfo)
{
$ids[] = $docinfo['document_id'];
}
// mark the documents as being processed
$ids=implode(',',$ids);
$sql = "UPDATE index_files SET processdate='$date' WHERE document_id in ($ids)";
DBUtil::runQuery($sql);
$extractorCache = array();
$storageManager = KTStorageManagerUtil::getSingleton();
$tempPath = $config->get("urls/tmpDirectory");
foreach($result as $docinfo)
{
// increment indexed documents count
Indexer::incrementCount();
$docId=$docinfo['document_id'];
$extension=$docinfo['filetypes'];
$mimeType=$docinfo['mimetypes'];
$extractorClass=$docinfo['extractor'];
$indexDocument = in_array($docinfo['what'], array('A','C'));
$indexDiscussion = in_array($docinfo['what'], array('A','D'));
$this->indexingHistory = '';
$this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Indexing docid: %d extension: '%s' mimetype: '%s' extractor: '%s'"), $docId, $extension,$mimeType,$extractorClass), 'debug');
if (empty($extractorClass))
{
/*
if no extractor is found and we don't need to index discussions, then we can remove the item from the queue.
*/
if ($indexDiscussion)
{
$indexDocument = false;
$this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Not indexing docid: %d content because extractor could not be resolve. Still indexing discussion."), $docId), 'info');
}
else
{
Indexer::unqueueDocument($docId, sprintf(_kt("No extractor for docid: %d"),$docId));
continue;
}
}
else
{
/*
If an extractor is available, we must ensure it is enabled.
*/
if (!$this->isExtractorEnabled($extractorClass))
{
$this->logPendingDocumentInfoStatus($docId, sprintf(_kt("diagnose: Not indexing docid: %d because extractor '%s' is disabled."), $docId, $extractorClass), 'info');
continue;
}
}
if ($this->debug)
{
$this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Processing docid: %d.\n"),$docId), 'info');
}
$document = Document::get($docId);
if (PEAR::isError($document))
{
Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Cannot resolve document id %d: %s."),$docId, $document->getMessage()), 'error');
continue;
}
if ($this->restartCurrentBatch)
{
Indexer::unqueueDocument($docId);
Indexer::index($docId, 'A');
continue;
}
$filename = $document->getFileName();
if (substr($filename,0,1) == '~' || substr($filename,-1) == '~')
{
Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Filename for document id %d starts with a tilde (~). This is assumed to be a temporary file. This is ignored."),$docId), 'error');
continue;
}
$removeFromQueue = true;
if ($indexDocument)
{
if (array_key_exists($extractorClass, $extractorCache))
{
$extractor = $extractorCache[$extractorClass];
}
else
{
$extractor = $extractorCache[$extractorClass] = $this->getExtractor($extractorClass);
}
if (!($extractor instanceof DocumentExtractor))
{
$this->logPendingDocumentInfoStatus($docId, sprintf(_kt("indexDocuments: extractor '%s' is not a document extractor class."),$extractorClass), 'error');
continue;
}
$version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();
$sourceFile = $storageManager->temporaryFile($document);
if (empty($sourceFile) || !is_file($sourceFile))
{
Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: source file '%s' for document %d does not exist."),$sourceFile,$docId), 'error');
continue;
}
if ($extractor->needsIntermediateSourceFile())
{
//$extension = pathinfo($document->getFileName(), PATHINFO_EXTENSION);
$intermediate = $tempPath . '/'. $docId . '.' . $extension;
$result = @copy($sourceFile, $intermediate);
if ($result === false)
{
$this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not create intermediate file from document %d"),$docId), 'error');
// problem. lets try again later. probably permission related. log the issue.
continue;
}
$sourceFile = $intermediate;
}
$targetFile = tempnam($tempPath, 'ktindexer');
$extractor->setSourceFile($sourceFile);
$extractor->setMimeType($mimeType);
$extractor->setExtension($extension);
$extractor->setTargetFile($targetFile);
$extractor->setDocument($document);
$extractor->setIndexingStatus(null);
$extractor->setExtractionStatus(null);
$this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Extra Info docid: %d Source File: '%s' Target File: '%s'"),$docId,$sourceFile,$targetFile), 'debug');
$this->executeHook($extractor, 'pre_extract');
$this->executeHook($extractor, 'pre_extract', $mimeType);
$removeFromQueue = false;
if ($extractor->extractTextContent())
{
// the extractor may need to create another target file
$targetFile = $extractor->getTargetFile();
$extractor->setExtractionStatus(true);
$this->executeHook($extractor, 'pre_index');
$this->executeHook($extractor, 'pre_index', $mimeType);
$title = $document->getName();
if ($indexDiscussion)
{
if (!$this->filterText($targetFile))
{
$this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error');
}
else
{
$indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);
$removeFromQueue = $indexStatus;
if (!$indexStatus)
{
$this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocumentAndDiscussion"),$docId), 'error');
}
$extractor->setIndexingStatus($indexStatus);
}
}
else
{
if (!$this->filterText($targetFile))
{
$this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error');
}
else
{
$indexStatus = $this->indexDocument($docId, $targetFile, $title, $version);
$removeFromQueue = $indexStatus;
if (!$indexStatus)
{
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -