📄 extractorcore.inc.php.svn-base
字号:
* @return string */
    protected function getCommandLine()
    {
        throw new Exception(_kt('getCommandLine is not implemented'));
    }

    /**
     * Builds the command line, logs it at debug level and executes it.
     * Returns true if success.
     *
     * @return boolean
     */
    public function extractTextContent()
    {
        global $default;

        $cmdline = $this->getCommandLine();
        $class = get_class($this);
        $default->log->debug("$class: " . $cmdline);

        return $this->exec($cmdline);
    }
}

/**
 * An external extractor that first tries a command line tool and falls back
 * to the StarOffice/OpenOffice extractor when the tool is disabled, not
 * usable, or reports a failure.
 */
abstract class OOFallbackDocumentExtractor extends ExternalDocumentExtractor
{
    /**
     * Value returned by KTUtil::findCommand() for the external binary, or
     * false when the tool is disabled in the config or the OS is Windows.
     *
     * @var string|false
     */
    protected $cmd;

    /**
     * Command line template. It is evaluated as a double-quoted PHP string
     * in getCommandLine().
     *
     * @var string
     */
    protected $params;

    /**
     * Whether the OpenOffice fallback is enabled ('indexer/useOpenOffice').
     *
     * This used to be an undeclared (dynamic) property, which is deprecated
     * as of PHP 8.2. It is declared public because dynamic properties are
     * public, so any code reading it from outside keeps working.
     *
     * @var boolean
     */
    public $useOO;

    /**
     * The fallback extractor. Only instantiated when $useOO is true.
     *
     * @var StarOfficeExtractor
     */
    protected $oo;

    /**
     * @param string $cmd    Name of the external command; also used to build the config keys.
     * @param string $params Default command line template when none is configured.
     */
    public function __construct($cmd, $params)
    {
        parent::__construct();

        $this->cmd = KTUtil::findCommand('externalBinary/' . $cmd, false);

        $config = KTConfig::getSingleton();
        $this->params = $config->get('indexer/' . $cmd . 'cmdline', $params);
        $this->useOO = $config->get('indexer/useOpenOffice', true);

        // The external tool is never used on Windows or when switched off in the config.
        if (!$config->get('indexer/use_' . $cmd, true) || OS_WINDOWS) {
            $this->cmd = false;
        }

        if ($this->useOO) {
            require_once('extractors/StarOfficeExtractor.inc.php');
            $this->oo = new StarOfficeExtractor();
        }
    }

    /**
     * Tells whether an intermediate source file is required.
     *
     * @return boolean
     */
    public function needsIntermediateSourceFile()
    {
        // we need the intermediate file because it
        // has the correct extension. documentConverter uses the extension to determine mimetype
        return ($this->useOO);
    }

    /**
     * Expands the configured command line template.
     *
     * @return string
     */
    protected function getCommandLine()
    {
        // These locals look unused but must stay: the template is evaluated
        // below as a double-quoted string and may reference any of them.
        $sourcefile = $this->sourcefile;
        $targetfile = $this->targetfile;
        $escape = '"';
        $cmd = $this->cmd;

        $cmdline = $this->params;

        // NOTE(review): eval() on a value read from the configuration. Anybody
        // who can edit 'indexer/<cmd>cmdline' can run arbitrary PHP here, and
        // file names are interpolated without shell escaping. Left in place
        // because existing templates rely on PHP string interpolation.
        $cmdline = eval("return \"$cmdline\";");
        $cmdline = str_replace('\\', '/', $cmdline);

        return $cmdline;
    }

    /**
     * Tries the external command first and falls back to OpenOffice.
     *
     * @return boolean
     */
    public function extractTextContent()
    {
        if ($this->cmd !== false) {
            // so we have catppt or something
            $result = parent::extractTextContent();
            if ($result !== false) {
                // if it returns true, we can bail
                return true;
            }
            // if failure, fallthrough, and attempt OO
        }

        if (!$this->useOO) {
            global $default;

            $docId = $this->document->getId();
            $cmd = $this->cmd;
            $default->log->info("The document {$docId} cannot be indexed as {$cmd} is not available and OpenOffice is not in use.");

            // Leave an empty result behind and report success.
            file_put_contents($this->targetfile, '');
            return true;
        }

        // Hand the current job over to the OpenOffice extractor...
        $this->oo->setSourceFile($this->sourcefile);
        $this->oo->setMimeType($this->mimetype);
        $this->oo->setExtension($this->extension);
        $this->oo->setTargetFile($this->targetfile);
        $this->oo->setDocument($this->document);
        $this->oo->setIndexingStatus(null);
        $this->oo->setExtractionStatus(null);

        $result = $this->oo->extractTextContent();

        // ...and copy its outcome back onto this extractor.
        $this->setIndexingStatus($this->oo->getIndexingStatus());
        $this->setExtractionStatus($this->oo->getExtractionStatus());
        $this->setTargetFile($this->oo->getTargetFile());

        return $result;
    }

    /**
     * Diagnoses the fallback extractor when it is the only option.
     *
     * @return mixed
     */
    public function diagnose()
    {
        if ($this->cmd !== false || !$this->useOO) {
            // cmd is found. we don't care about oo.
            // if we can't use oo, well, not much we can do....
            return null;
        }

        return $this->oo->diagnose();
    }
}

/**
 * An extension to the external document extractor. A derived class simply needs
 * to implement a constructor and getSupportedMimeTypes().
 *
 */
abstract class ApplicationExtractor extends ExternalDocumentExtractor
{
    /**
     * The full path to the application that will be run. This will be resolved from
     * the path or using the config file.
     *
     * @var string
     */
    private $application;

    /**
     * The command name of the application that can be run.
     *
     * @var string
     */
    private $command;

    /**
     * This is the friendly name for the extractor.
     *
     * @var string
     */
    private $displayname;

    /**
     * The command line parameters for the application.
     * This may include {source} and {target} where substitutions will be done.
     *
     * @var string
     */
    private $params;

    /**
     * Initialise the extractor.
     *
     * @param string $section The section in the config file.
     * @param string $appname The application name in the config file.
     * @param string $command The command that can be run.
     * @param string $displayname
     * @param string $params
     */
    public function __construct($section, $appname, $command, $displayname, $params)
    {
        parent::__construct();

        $this->application = KTUtil::findCommand("$section/$appname", $command);
        $this->command = $command;
        $this->displayname = $displayname;
        $this->params = $params;
    }

    /**
     * Return the display name.
     *
     * @return string
     */
    public function getDisplayName()
    {
        return _kt($this->displayname);
    }

    /**
     * Returns the command line after performing substitutions.
     *
     * The application path is wrapped in quotes (double quotes on Windows,
     * single quotes elsewhere). {source} and {target} in the parameters are
     * replaced verbatim with the source and target file names; the file
     * names themselves are not quoted or escaped here.
     *
     * @return string
     */
    protected function getCommandLine()
    {
        $placeholders = array('{source}', '{target}');
        $values = array($this->sourcefile, $this->targetfile);
        $quote = OS_WINDOWS ? '"' : '\'';

        $arguments = str_replace($placeholders, $values, $this->params);

        return $quote . $this->application . $quote . ' ' . $arguments;
    }

    /**
     * Identifies if there are any circumstances why the command can not run that could result in the text extraction process
     * failing.
     *
     * @return mixed Returns string if there is a problem, null otherwise.
     */
    public function diagnose()
    {
        if (false === $this->application) {
            return sprintf(_kt("Cannot locate binary for %s (%s)."), $this->displayname, $this->command);
        }

        return null;
    }
}

/**
 * Base class for extractors that read the source file directly and write
 * its filtered content to the target file.
 */
abstract class TextExtractor extends DocumentExtractor
{
    /**
     * This extracts the text from the document.
     *
     * At most 'indexer/maxTextSize' bytes are read from the source file.
     *
     * @return boolean
     */
    public function extractTextContent()
    {
        $config = KTConfig::getSingleton();
        $maxTextSize = $config->get('indexer/maxTextSize', 1024 * 1024 * 10); // we'll only take 10 meg by default

        // Explicit false/0 instead of null: passing null for use_include_path
        // and offset is deprecated since PHP 8.1. Same bytes are read.
        $content = file_get_contents($this->sourcefile, false, null, 0, $maxTextSize);
        if (false === $content) {
            return false;
        }

        $result = file_put_contents($this->targetfile, $this->filter($content));

        return false !== $result;
    }

    /**
     * There are no external dependencies to diagnose.
     *
     * @return null
     */
    public function diagnose()
    {
        return null;
    }
}

/**
 * The composite extractor implies that a conversion is done to an intermediate form before another extractor is run.
 *
 */
abstract class CompositeExtractor extends DocumentExtractor
{
    /**
     * The initial extractor
     *
     * @var DocumentExtractor
     */
    private $sourceExtractor;

    /**
     * The text extractor
     *
     * @var DocumentExtractor
     */
    private $targetExtractor;

    /**
     * The extension for the initial extraction
     *
     * @var string
     */
    private $targetExtension;

    /**
     * The mime type of the initial extraction.
     *
     * @var string
     */
    private $targetMimeType;

    /**
     * @param DocumentExtractor $sourceExtractor   Produces the intermediate file.
     * @param string            $targetExtension   Extension of the intermediate file.
     * @param string            $targetMimeType    Mime type of the intermediate file.
     * @param DocumentExtractor $targetExtractor   Extracts the text from the intermediate file.
     * @param boolean           $needsIntermediate Passed on to needsIntermediateSourceFile().
     */
    public function __construct($sourceExtractor, $targetExtension, $targetMimeType, $targetExtractor, $needsIntermediate)
    {
        $this->sourceExtractor = $sourceExtractor;
        $this->targetExtractor = $targetExtractor;
        $this->targetExtension = $targetExtension;
        $this->targetMimeType = $targetMimeType;
        $this->needsIntermediateSourceFile($needsIntermediate);
    }

    /**
     * Extracts the content of the document
     *
     * Runs the source extractor into an intermediate file, then runs the
     * target extractor on that file. The intermediate file is removed
     * afterwards.
     *
     * @return boolean
     */
    public function extractTextContent()
    {
        $placeholder = $this->targetfile . '.' . $this->targetExtension;
        touch($placeholder);

        $this->sourceExtractor->setSourceFile($this->sourcefile);
        $this->sourceExtractor->setTargetFile($placeholder);
        $this->sourceExtractor->setDocument($this->getDocument());
        $this->sourceExtractor->setMimeType($this->mimetype);
        $this->sourceExtractor->setExtension($this->extension);

        if (!$this->sourceExtractor->extractTextContent()) {
            $this->output = $this->sourceExtractor->output;
            @unlink($placeholder);
            return false;
        }

        // The source extractor may have written to a different file than the
        // one it was given, so ask it where the result is.
        $intermediateFile = $this->sourceExtractor->getTargetFile();

        $this->targetExtractor->setSourceFile($intermediateFile);
        $this->targetExtractor->setTargetFile($this->targetfile);
        $this->targetExtractor->setDocument($this->getDocument());
        $this->targetExtractor->setMimeType($this->targetMimeType);
        $this->targetExtractor->setExtension($this->targetExtension);

        $result = $this->targetExtractor->extractTextContent();
        if (!$result) {
            $this->output = $this->targetExtractor->output;
        }

        @unlink($intermediateFile);

        $finalTarget = $this->targetExtractor->getTargetFile();
        $this->setTargetFile($finalTarget);

        // If the source extractor redirected its output, the file touched at
        // the top would otherwise be left behind. Never remove the final result.
        if ($placeholder !== $intermediateFile && $placeholder !== $finalTarget) {
            @unlink($placeholder);
        }

        return $result;
    }

    /**
     * Diagnose the extractors
     *
     * Returns the first non-empty diagnosis, checking the source extractor
     * before the target extractor.
     *
     * @return mixed
     */
    public function diagnose()
    {
        $diagnosis = $this->sourceExtractor->diagnose();
        if (!empty($diagnosis)) {
            return $diagnosis;
        }

        $diagnosis = $this->targetExtractor->diagnose();
        if (!empty($diagnosis)) {
            return $diagnosis;
        }

        return null;
    }
}

/**
 * An extractor hook provides callbacks that are given the extractor before
 * and after extraction and before and after indexing.
 *
 */
abstract class ExtractorHook
{
    /**
     * Returns an array of supported mime types.
     * e.g. return array('text/plain');
     *
     * @return array
     */
    abstract public function getSupportedMimeTypes();

    /**
     * Returns the friendly name for the hook.
     *
     * @return string
     */
    abstract public function getDisplayName();

    /**
     * This does a basic diagnosis on the hook.
     *
     * @return string
     */
    public function diagnose()
    {
        return null;
    }

    /**
     * Perform any pre extraction activities.
     *
     * @param DocumentExtractor $extractor
     */
    public function pre_extract($extractor)
    {
    }

    /**
     * Perform any post extraction activities.
     *
     * @param DocumentExtractor $extractor
     */
    public function post_extract($extractor)
    {
    }

    /**
     * Perform any pre indexing activities.
     *
     * @param DocumentExtractor $extractor
     */
    public function pre_index($extractor)
    {
    }

    /**
     * Perform any post indexing activities.
     *
     * @param DocumentExtractor $extractor
     */
    public function post_index($extractor)
    {
    }
}
?>
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -