📄 indexercore.inc.php.svn-base
字号:
/**
 * Sets the restartCurrentBatch flag.
 *
 * indexDocuments() checks this flag for every document in its batch and,
 * when it is set, unqueues the document and submits it again through
 * Indexer::index($docId, 'A') instead of indexing it.
 */
public function restartBatch()
{
    $this->restartCurrentBatch = true;
}

/**
 * Records a status message against a pending document and writes it to the log.
 *
 * The message is always handed to updatePendingDocumentStatus(). Messages of
 * level 'debug' reach the logger only when $this->debug is set; any other
 * level is dispatched to the logger method of the same name.
 *
 * @param int $documentId
 * @param string $message
 * @param string $level This may be info, error, debug
 */
private function logPendingDocumentInfoStatus($documentId, $message, $level)
{
    $this->updatePendingDocumentStatus($documentId, $message, $level);

    global $default;

    if ($level == 'debug') {
        if ($this->debug) {
            $default->log->debug($message);
        }
        return;
    }

    $default->log->$level($message);
}

/**
 * Loads and instantiates the extractor class with the given name.
 *
 * The class is loaded from SEARCH2_INDEXER_DIR . 'extractors/<class>.inc.php'.
 *
 * @param string $extractorClass Name of the extractor class.
 * @return DocumentExtractor|null null when no class name was supplied.
 * @throws Exception When the include file is missing, the class is not
 *                   defined by that file, or the class is not a
 *                   DocumentExtractor.
 */
public function getExtractor($extractorClass)
{
    if (empty($extractorClass)) {
        return null;
    }

    $includeFile = SEARCH2_INDEXER_DIR . 'extractors/' . $extractorClass . '.inc.php';
    if (!file_exists($includeFile)) {
        throw new Exception("Extractor file does not exist: $includeFile");
    }
    require_once($includeFile);

    // The messages below previously interpolated an undefined $classname,
    // which dropped the class name from the error text.
    if (!class_exists($extractorClass)) {
        throw new Exception("Extractor '$extractorClass' not defined in file: $includeFile");
    }

    $extractor = new $extractorClass();
    if (!($extractor instanceof DocumentExtractor)) {
        throw new Exception("Class $extractorClass was expected to be of type DocumentExtractor");
    }

    return $extractor;
}

/**
 * Lists entries of the indexing queue (index_files) for documents with
 * status_id 1, ordered by indexdate.
 *
 * @param bool $problemItemsOnly true: only entries that carry a non-empty
 *                               status message; false: only entries whose
 *                               status message is NULL or empty.
 * @return mixed The result of DBUtil::getResultArray(). indexDocuments()
 *               checks the same call with PEAR::isError(), so callers
 *               should expect a possible PEAR error here as well.
 */
public static function getIndexingQueue($problemItemsOnly=true)
{
    if ($problemItemsOnly) {
        $statusFilter = "(iff.status_msg IS NOT NULL AND iff.status_msg <> '')";
    } else {
        $statusFilter = "(iff.status_msg IS NULL or iff.status_msg = '')";
    }

    $sql = "SELECT iff.document_id, iff.indexdate, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what, iff.status_msg, dcv.filename"
        . " FROM index_files iff"
        . " INNER JOIN documents d ON iff.document_id=d.id"
        . " INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id"
        . " INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id"
        . " INNER JOIN mime_types mt ON dcv.mime_id=mt.id"
        . " LEFT JOIN mime_extractors me ON mt.extractor_id=me.id"
        . " WHERE $statusFilter AND d.status_id=1"
        . " ORDER BY indexdate ";

    return DBUtil::getResultArray($sql);
}

/**
 * Lists the queue entries that have no status message.
 *
 * @return mixed See getIndexingQueue().
 */
public static function getPendingIndexingQueue()
{
    return Indexer::getIndexingQueue(false);
}

/**
 * Recomputes the indexer statistics and diagnostics and stores them,
 * serialized, in the system settings 'indexerStats', 'indexerDiagnostics'
 * and 'extractorDiagnostics'.
 */
public function updateIndexStats()
{
    // Last optimisation run.
    $optimisationDate = KTUtil::getSystemSetting('luceneOptimisationDate', '');
    $noOptimisation = false;
    if ($optimisationDate == '') {
        $optimisationDate = _kt('N/A');
        $optimisationPeriod = $optimisationDate;
    } else {
        $period = KTUtil::computePeriodToDate($optimisationDate, null, true);
        // NOTE(review): 'days' is presumably the number of days since the
        // last optimisation - confirm against KTUtil::computePeriodToDate().
        $noOptimisation = $period['days'] > 2;
        $optimisationPeriod = $period['str'];
        $optimisationDate = date('Y-m-d H:i:s', $optimisationDate);
    }

    // Last indexing run.
    $indexingDate = KTUtil::getSystemSetting('luceneIndexingDate', '');
    if ($indexingDate == '') {
        $indexingDate = _kt('N/A');
        $indexingPeriod = $indexingDate;
    } else {
        $indexingPeriod = KTUtil::computePeriodToDate($indexingDate);
        $indexingDate = date('Y-m-d H:i:s', $indexingDate);
    }

    $index = Indexer::get();
    $docsInIndex = $index->getDocumentsInIndex();

    // we are only interested in documents that are active
    $sql = "SELECT count(*) as docsInQueue FROM index_files i"
        . " inner join documents d on i.document_id = d.id"
        . " where (i.status_msg is null or i.status_msg = '') and d.status_id=1";
    $docsInQueue = DBUtil::getOneResultKey($sql, 'docsInQueue');

    // An entry is an error only when it has a non-empty status message. The
    // previous "is not null OR <> ''" also counted empty-string messages,
    // which the pending count above already includes.
    $sql = "SELECT count(*) as errorsInQueue FROM index_files i"
        . " inner join documents d on i.document_id = d.id"
        . " where (i.status_msg is not null and i.status_msg <> '') and d.status_id=1";
    $errorsInQueue = DBUtil::getOneResultKey($sql, 'errorsInQueue');

    $sql = "SELECT count(*) as docsInRepository FROM documents where status_id=1";
    $docsInRepository = DBUtil::getOneResultKey($sql, 'docsInRepository');

    if ($docsInRepository == 0) {
        // Nothing in the repository: avoid dividing by zero.
        $indexingCoverage = '0.00%';
        $queueCoverage = $indexingCoverage;
    } else {
        // compute indexing coverage
        $indexingCoverage = _kt('Not Available');
        if (is_numeric($docsInIndex)) {
            $indexingCoverage = ($docsInIndex * 100) / $docsInRepository;
            $indexingCoverage = number_format($indexingCoverage, 2, '.', ',') . '%';
        }

        // compute queue coverage
        $queueCoverage = _kt('Not Available');
        if (is_numeric($docsInQueue)) {
            $queueCoverage = ($docsInQueue * 100) / $docsInRepository;
            $queueCoverage = number_format($queueCoverage, 2, '.', ',') . '%';
        }
    }

    $stats = array(
        'optimisationDate' => $optimisationDate,
        'optimisationPeriod' => $optimisationPeriod,
        'indexingDate' => $indexingDate,
        'indexingPeriod' => $indexingPeriod,
        'docsInIndex' => $docsInIndex,
        'docsInQueue' => $docsInQueue,
        'errorsInQueue' => $errorsInQueue,
        'docsInRepository' => $docsInRepository,
        'indexingCoverage' => $indexingCoverage,
        'queueCoverage' => $queueCoverage,
        'noOptimisation' => $noOptimisation
    );
    KTUtil::setSystemSetting('indexerStats', serialize($stats));

    $indexer = Indexer::get();

    $diagnosis = $indexer->diagnose();
    KTUtil::setSystemSetting('indexerDiagnostics', serialize($diagnosis));

    $extractorDiagnosis = $indexer->diagnoseExtractors();
    KTUtil::setSystemSetting('extractorDiagnostics', serialize($extractorDiagnosis));
}

/** * The main function that may be called repeatedly to index documents. * * @param int $max Default 20 */ public function indexDocuments($max=null) { global $default; $config =& KTConfig::getSingleton(); /*$indexLockFile = $config->get('cache/cacheDirectory') . '/main.index.lock'; if (is_file($indexLockFile)) { $default->log->info('indexDocuments: main.index.lock seems to exist. it could be that the indexing is still underway.'); $default->log->info('indexDocuments: Remove "' . $indexLockFile . 
'" if the indexing is not running or extend the frequency at which the background task runs!'); return; } touch($indexLockFile);*/ $this->checkForRegisteredTypes(); if ($this->debug) $default->log->debug('indexDocuments: start'); if (!$this->doesDiagnosticsPass()) { //unlink($indexLockFile); if ($this->debug) $default->log->debug('indexDocuments: stopping - diagnostics problem. The dashboard will provide more information.'); return; } if (is_null($max)) { $max = $config->get('indexer/batchDocuments',20); } $this->loadExtractorHooks(); Indexer::clearoutDeleted(); $date = date('Y-m-d H:i:s'); // identify the indexers that must run // mysql specific limit! $sql = "SELECT iff.document_id, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what FROM index_files iff INNER JOIN documents d ON iff.document_id=d.id INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id INNER JOIN mime_types mt ON dcv.mime_id=mt.id LEFT JOIN mime_extractors me ON mt.extractor_id=me.id WHERE (iff.processdate IS NULL or iff.processdate < date_sub('$date', interval 1 day)) AND dmv.status_id=1 ORDER BY indexdate LIMIT $max"; $result = DBUtil::getResultArray($sql); if (PEAR::isError($result)) { //unlink($indexLockFile); if ($this->debug) $default->log->debug('indexDocuments: stopping - db error'); return; } KTUtil::setSystemSetting('luceneIndexingDate', time()); // bail if no work to do if (count($result) == 0) { //unlink($indexLockFile); if ($this->debug) $default->log->debug('indexDocuments: stopping - no work to be done'); return; } // identify any documents that need indexing and mark them // so they are not taken in a followup run $ids = array(); foreach($result as $docinfo) { $ids[] = $docinfo['document_id']; } // mark the documents as being processed $ids=implode(',',$ids); $sql = "UPDATE index_files SET processdate='$date' WHERE document_id in ($ids)"; DBUtil::runQuery($sql); $extractorCache = 
array(); $storageManager = KTStorageManagerUtil::getSingleton(); $tempPath = $config->get("urls/tmpDirectory"); foreach($result as $docinfo) { // increment indexed documents count Indexer::incrementCount(); $docId=$docinfo['document_id']; $extension=$docinfo['filetypes']; $mimeType=$docinfo['mimetypes']; $extractorClass=$docinfo['extractor']; $indexDocument = in_array($docinfo['what'], array('A','C')); $indexDiscussion = in_array($docinfo['what'], array('A','D')); $this->indexingHistory = ''; $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Indexing docid: %d extension: '%s' mimetype: '%s' extractor: '%s'"), $docId, $extension,$mimeType,$extractorClass), 'debug'); if (empty($extractorClass)) { /* if no extractor is found and we don't need to index discussions, then we can remove the item from the queue. */ if ($indexDiscussion) { $indexDocument = false; $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Not indexing docid: %d content because extractor could not be resolve. Still indexing discussion."), $docId), 'info'); } else { Indexer::unqueueDocument($docId, sprintf(_kt("No extractor for docid: %d"),$docId)); continue; } } else { /* If an extractor is available, we must ensure it is enabled. 
*/ if (!$this->isExtractorEnabled($extractorClass)) { $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("diagnose: Not indexing docid: %d because extractor '%s' is disabled."), $docId, $extractorClass), 'info'); continue; } } if ($this->debug) { $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Processing docid: %d.\n"),$docId), 'info'); } $document = Document::get($docId); if (PEAR::isError($document)) { Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Cannot resolve document id %d: %s."),$docId, $document->getMessage()), 'error'); continue; } if ($this->restartCurrentBatch) { Indexer::unqueueDocument($docId); Indexer::index($docId, 'A'); continue; } $filename = $document->getFileName(); if (substr($filename,0,1) == '~' || substr($filename,-1) == '~') { Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Filename for document id %d starts with a tilde (~). This is assumed to be a temporary file. This is ignored."),$docId), 'error'); continue; } $removeFromQueue = true; if ($indexDocument) { if (array_key_exists($extractorClass, $extractorCache)) { $extractor = $extractorCache[$extractorClass]; } else { $extractor = $extractorCache[$extractorClass] = $this->getExtractor($extractorClass); } if (!($extractor instanceof DocumentExtractor)) { $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("indexDocuments: extractor '%s' is not a document extractor class."),$extractorClass), 'error'); continue; } $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber(); $sourceFile = $storageManager->temporaryFile($document); if (empty($sourceFile) || !is_file($sourceFile)) { Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: source file '%s' for document %d does not exist."),$sourceFile,$docId), 'error'); continue; } if ($extractor->needsIntermediateSourceFile()) { //$extension = pathinfo($document->getFileName(), PATHINFO_EXTENSION); $intermediate = $tempPath . '/'. $docId . '.' . 
$extension; $result = @copy($sourceFile, $intermediate); if ($result === false) { $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not create intermediate file from document %d"),$docId), 'error'); // problem. lets try again later. probably permission related. log the issue. continue; } $sourceFile = $intermediate; } $targetFile = tempnam($tempPath, 'ktindexer'); $extractor->setSourceFile($sourceFile); $extractor->setMimeType($mimeType); $extractor->setExtension($extension); $extractor->setTargetFile($targetFile); $extractor->setDocument($document); $extractor->setIndexingStatus(null); $extractor->setExtractionStatus(null); $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Extra Info docid: %d Source File: '%s' Target File: '%s'"),$docId,$sourceFile,$targetFile), 'debug'); $this->executeHook($extractor, 'pre_extract'); $this->executeHook($extractor, 'pre_extract', $mimeType); $removeFromQueue = false; if ($extractor->extractTextContent()) { // the extractor may need to create another target file $targetFile = $extractor->getTargetFile(); $extractor->setExtractionStatus(true); $this->executeHook($extractor, 'pre_index'); $this->executeHook($extractor, 'pre_index', $mimeType); $title = $document->getName(); if ($indexDiscussion) { if (!$this->filterText($targetFile)) { $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error'); } else { $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version); $removeFromQueue = $indexStatus; if (!$indexStatus) { $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocumentAndDiscussion"),$docId), 'error'); } $extractor->setIndexingStatus($indexStatus); } } else { if (!$this->filterText($targetFile)) { $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error'); } else { $indexStatus = $this->indexDocument($docId, $targetFile, $title, $version); 
$removeFromQueue = $indexStatus; if (!$indexStatus) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -