extractorcore.inc.php
来自「PHP 知识管理系统(基于树结构的知识管理系统), 英文原版的PHP源码。」· PHP 代码 · 共 899 行 · 第 1/2 页
PHP
899 行
* @return string
*/
protected function getCommandLine()
{
    // This base version has no command to offer: it only reports that the
    // method was not overridden. Subclasses in this file supply their own.
    $message = _kt('getCommandLine is not implemented');

    throw new Exception($message);
}
/**
 * Runs the external command for this extractor.
 *
 * The command line is obtained from getCommandLine(), written to the debug
 * log prefixed with the concrete class name, and then handed to exec().
 *
 * @return boolean Whatever exec() returns; true indicates success.
 */
public function extractTextContent()
{
    global $default;

    $commandLine = $this->getCommandLine();
    $extractorClass = get_class($this);

    // Log the exact command so a failed extraction can be reproduced by hand.
    $default->log->debug($extractorClass . ': ' . $commandLine);

    return $this->exec($commandLine);
}
}
/**
 * Extractor that first tries a dedicated external binary (e.g. catppt) and
 * falls back to the StarOffice/OpenOffice extractor when that binary is not
 * usable or its run fails.
 */
abstract class OOFallbackDocumentExtractor extends ExternalDocumentExtractor
{
    /**
     * Result of KTUtil::findCommand() for the external binary. Forced to false
     * when the binary is disabled in the config or when running on Windows.
     *
     * @var mixed
     */
    protected $cmd;

    /**
     * Name of the external binary as passed to the constructor. Kept so that
     * log messages can still name the binary when $cmd is false.
     *
     * @var string
     */
    protected $cmdName;

    /**
     * Command line template. Read from the config key 'indexer/<cmd>cmdline',
     * with the constructor argument as the default.
     *
     * @var string
     */
    protected $params;

    /**
     * Whether the OpenOffice fallback is enabled
     * (config key 'indexer/useOpenOffice', default true).
     *
     * Declared explicitly: it used to be created on the fly in the
     * constructor, which is a deprecated dynamic property as of PHP 8.2.
     *
     * @var boolean
     */
    protected $useOO;

    /**
     * The fallback extractor. Only instantiated when $useOO is true.
     *
     * @var StarOfficeExtractor
     */
    protected $oo;

    /**
     * Resolve the external binary and read the related indexer settings.
     *
     * @param string $cmd    Name of the external binary; also used to build the config keys.
     * @param string $params Default command line template if none is configured.
     */
    public function __construct($cmd, $params)
    {
        parent::__construct();

        $this->cmdName = $cmd;
        $this->cmd = KTUtil::findCommand('externalBinary/' . $cmd, false);

        $config = KTConfig::getSingleton();
        $this->params = $config->get('indexer/' . $cmd . 'cmdline', $params);
        $this->useOO = $config->get('indexer/useOpenOffice', true);

        // The binary is never used on Windows or when switched off in the config.
        if (!$config->get('indexer/use_' . $cmd, true) || OS_WINDOWS)
        {
            $this->cmd = false;
        }

        if ($this->useOO)
        {
            require_once('extractors/StarOfficeExtractor.inc.php');
            $this->oo = new StarOfficeExtractor();
        }
    }

    /**
     * Indicates whether an intermediate source file is required.
     *
     * @return boolean True when the OpenOffice fallback is enabled.
     */
    public function needsIntermediateSourceFile()
    {
        // we need the intermediate file because it
        // has the correct extension. documentConverter uses the extension to determine mimetype
        return ($this->useOO);
    }

    /**
     * Build the command line by expanding the template in $params.
     *
     * The template is evaluated as a double-quoted PHP string, so it can refer
     * to $sourcefile, $targetfile, $escape and $cmd. Backslashes in the result
     * are converted to forward slashes.
     *
     * @return string
     */
    protected function getCommandLine()
    {
        // These locals look unused, but the evaluated template references them by name.
        $sourcefile = $this->sourcefile;
        $targetfile = $this->targetfile;
        $escape = '"';
        $cmd = $this->cmd;

        $cmdline = $this->params;

        // SECURITY NOTE(review): the template comes from configuration and the
        // file names are interpolated into it before it is run through eval().
        // Anyone able to edit that config value can execute arbitrary PHP.
        // Left as is because existing templates rely on PHP interpolation;
        // consider replacing it with explicit placeholder substitution.
        $cmdline = eval("return \"$cmdline\";");

        $cmdline = str_replace('\\', '/', $cmdline);

        return $cmdline;
    }

    /**
     * Extract the text, preferring the external binary and falling back to
     * OpenOffice. If neither is available an empty target file is written.
     *
     * @return boolean
     */
    public function extractTextContent()
    {
        if ($this->cmd !== false)
        {
            // so we have catppt or something
            $result = parent::extractTextContent();
            if ($result !== false)
            {
                // if it returns true, we can bail
                return true;
            }
            // if failure, fallthrough, and attempt OO
        }

        if ($this->useOO)
        {
            // Hand our complete state over to the fallback extractor...
            $this->oo->setSourceFile($this->sourcefile);
            $this->oo->setMimeType($this->mimetype);
            $this->oo->setExtension($this->extension);
            $this->oo->setTargetFile($this->targetfile);
            $this->oo->setDocument($this->document);
            $this->oo->setIndexingStatus(null);
            $this->oo->setExtractionStatus(null);

            $result = $this->oo->extractTextContent();

            // ...and copy its outcome back so callers only have to look at this object.
            $this->setIndexingStatus($this->oo->getIndexingStatus());
            $this->setExtractionStatus($this->oo->getExtractionStatus());
            $this->setTargetFile($this->oo->getTargetFile());

            return $result;
        }
        else
        {
            global $default;

            $docId = $this->document->getId();
            // $this->cmd is false when the binary is disabled or missing; use the
            // configured name in that case so the log message is not left blank.
            $cmd = ($this->cmd !== false) ? $this->cmd : $this->cmdName;
            $default->log->info("The document {$docId} cannot be indexed as {$cmd} is not available and OpenOffice is not in use.");

            // Write an empty result and report success.
            file_put_contents($this->targetfile, '');
            return true;
        }
    }

    /**
     * Report problems that would prevent extraction.
     *
     * @return mixed Null when the external binary is available or OpenOffice is
     *               not in use; otherwise the diagnosis of the OpenOffice extractor.
     */
    public function diagnose()
    {
        if ($this->cmd !== false || !$this->useOO)
        {
            // cmd is found. we don't care about oo.
            // if we can't use oo, well, not much we can do....
            return null;
        }

        return $this->oo->diagnose();
    }
}
/**
 * Specialisation of the external document extractor for running a single
 * application. A derived class only has to provide a constructor and
 * getSupportedMimeTypes().
 */
abstract class ApplicationExtractor extends ExternalDocumentExtractor
{
    /**
     * Value returned by KTUtil::findCommand() for the application;
     * diagnose() treats false as "binary not found".
     *
     * @var string
     */
    private $application;

    /**
     * Command name of the application, as given to the constructor.
     *
     * @var string
     */
    private $command;

    /**
     * Friendly name of this extractor.
     *
     * @var string
     */
    private $displayname;

    /**
     * Command line parameters for the application. The markers {source} and
     * {target} are replaced with the source and target file names.
     *
     * @var string
     */
    private $params;

    /**
     * Set up the extractor and look up the application.
     *
     * @param string $section     The section in the config file.
     * @param string $appname     The application name in the config file.
     * @param string $command     The command that can be run.
     * @param string $displayname Friendly name of the extractor.
     * @param string $params      Parameter template, may contain {source} and {target}.
     */
    public function __construct($section, $appname, $command, $displayname, $params)
    {
        parent::__construct();

        $configKey = $section . '/' . $appname;

        $this->application = KTUtil::findCommand($configKey, $command);
        $this->command     = $command;
        $this->displayname = $displayname;
        $this->params      = $params;
    }

    /**
     * Return the display name, passed through _kt().
     *
     * @return string
     */
    public function getDisplayName()
    {
        return _kt($this->displayname);
    }

    /**
     * Build the command line: the quoted application followed by the
     * parameters with {source} and {target} substituted.
     *
     * @return string
     */
    protected function getCommandLine()
    {
        // Double quotes on Windows, single quotes everywhere else.
        $quote = OS_WINDOWS ? '"' : '\'';

        $arguments = str_replace(
            array('{source}', '{target}'),
            array($this->sourcefile, $this->targetfile),
            $this->params
        );

        return $quote . $this->application . $quote . ' ' . $arguments;
    }

    /**
     * Check whether anything would stop the command from running and so make
     * text extraction fail.
     *
     * @return mixed A message string if the binary was not located, null otherwise.
     */
    public function diagnose()
    {
        if ($this->application !== false)
        {
            return null;
        }

        return sprintf(_kt("Cannot locate binary for %s (%s)."), $this->displayname, $this->command);
    }
}
/**
 * Extractor for documents that already are text: the source file is read,
 * passed through filter() and written to the target file.
 */
abstract class TextExtractor extends DocumentExtractor
{
    /**
     * This extracts the text from the document.
     *
     * At most 'indexer/maxTextSize' bytes (10 MB by default) are read from the
     * source file.
     *
     * @return boolean False if the source could not be read or the target could
     *                 not be written, true otherwise.
     */
    public function extractTextContent()
    {
        $config = KTConfig::getSingleton();
        $maxTextSize = $config->get('indexer/maxTextSize', 1024 * 1024 * 10); // we'll only take 10 meg by default

        // Pass the real defaults (false, 0) for the parameters we skip over:
        // null for the bool $use_include_path and the int $offset is deprecated
        // as of PHP 8.1. The result is the same - read from the start of the file.
        $content = file_get_contents($this->sourcefile, false, null, 0, $maxTextSize);
        if (false === $content)
        {
            return false;
        }

        // filter() is not defined in this class - presumably inherited from
        // DocumentExtractor; confirm against the base class.
        $result = file_put_contents($this->targetfile, $this->filter($content));

        // file_put_contents() returns the byte count, which may legitimately be 0.
        return false !== $result;
    }

    /**
     * There are no external dependencies to diagnose.
     *
     * @return null
     */
    public function diagnose()
    {
        return null;
    }
}
/**
 * A composite extractor chains two extractors: the first converts the document
 * to an intermediate form, the second extracts the text from that form.
 */
abstract class CompositeExtractor extends DocumentExtractor
{
    /**
     * Extractor producing the intermediate file.
     *
     * @var DocumentExtractor
     */
    private $sourceExtractor;

    /**
     * Extractor turning the intermediate file into text.
     *
     * @var DocumentExtractor
     */
    private $targetExtractor;

    /**
     * File extension of the intermediate file.
     *
     * @var string
     */
    private $targetExtension;

    /**
     * Mime type of the intermediate file.
     *
     * @var string
     */
    private $targetMimeType;

    /**
     * @param DocumentExtractor $sourceExtractor   Runs first, on the original document.
     * @param string            $targetExtension   Extension of the intermediate file.
     * @param string            $targetMimeType    Mime type of the intermediate file.
     * @param DocumentExtractor $targetExtractor   Runs second, on the intermediate file.
     * @param boolean           $needsIntermediate Passed on to needsIntermediateSourceFile().
     */
    public function __construct($sourceExtractor, $targetExtension, $targetMimeType, $targetExtractor, $needsIntermediate)
    {
        $this->sourceExtractor = $sourceExtractor;
        $this->targetExtractor = $targetExtractor;
        $this->targetExtension = $targetExtension;
        $this->targetMimeType = $targetMimeType;

        // NOTE(review): elsewhere in this file needsIntermediateSourceFile() is a
        // getter without parameters. If the inherited version is too, this call
        // has no effect - confirm against DocumentExtractor.
        $this->needsIntermediateSourceFile($needsIntermediate);
    }

    /**
     * Point one of the wrapped extractors at its input and output.
     *
     * @param DocumentExtractor $extractor
     * @param string            $source    File the extractor reads.
     * @param string            $target    File the extractor writes.
     * @param string            $mimeType  Mime type of the source.
     * @param string            $extension Extension of the source.
     */
    private function prepareExtractor($extractor, $source, $target, $mimeType, $extension)
    {
        $extractor->setSourceFile($source);
        $extractor->setTargetFile($target);
        $extractor->setDocument($this->getDocument());
        $extractor->setMimeType($mimeType);
        $extractor->setExtension($extension);
    }

    /**
     * Runs both extractors in sequence.
     *
     * @return boolean False as soon as one of the two extractors fails; the
     *                 failing extractor's output is copied to $this->output.
     */
    public function extractTextContent()
    {
        // Stage 1: original document -> intermediate file next to the target.
        $intermediateFile = $this->targetfile . '.' . $this->targetExtension;
        touch($intermediateFile);

        $first = $this->sourceExtractor;
        $this->prepareExtractor($first, $this->sourcefile, $intermediateFile, $this->mimetype, $this->extension);

        if (!$first->extractTextContent())
        {
            $this->output = $first->output;
            @unlink($intermediateFile);
            return false;
        }

        // The first extractor may have redirected its output; use the file it reports.
        $intermediateFile = $first->getTargetFile();

        // Stage 2: intermediate file -> final text.
        $second = $this->targetExtractor;
        $this->prepareExtractor($second, $intermediateFile, $this->targetfile, $this->targetMimeType, $this->targetExtension);

        $result = $second->extractTextContent();
        if (!$result)
        {
            $this->output = $second->output;
        }

        // The intermediate file is removed whether or not stage 2 succeeded.
        @unlink($intermediateFile);

        $this->setTargetFile($second->getTargetFile());

        return $result;
    }

    /**
     * Diagnose both extractors, the source extractor first.
     *
     * @return mixed The first non-empty diagnosis, or null if there is none.
     */
    public function diagnose()
    {
        foreach (array($this->sourceExtractor, $this->targetExtractor) as $extractor)
        {
            $diagnosis = $extractor->diagnose();
            if (!empty($diagnosis))
            {
                return $diagnosis;
            }
        }

        return null;
    }
}
/**
 * Base class for extractor hooks. A hook names the mime types it applies to
 * and can override any of the four callbacks below, which are named for the
 * points before and after extraction and indexing.
 *
 * NOTE(review): the code that invokes these callbacks is not part of this
 * section - presumably the indexer; confirm before relying on call order.
 */
abstract class ExtractorHook
{
    /**
     * Lists the mime types this hook supports,
     * e.g. return array('plain/text');
     *
     * @return array
     */
    abstract public function getSupportedMimeTypes();

    /**
     * Gives the friendly name of the hook.
     *
     * @return string
     */
    abstract public function getDisplayName();

    /**
     * Basic self-diagnosis of the hook. The default reports no problem.
     *
     * @return string Null in this default implementation.
     */
    public function diagnose()
    {
        return null;
    }

    /**
     * Called for pre extraction activities. Does nothing by default.
     *
     * @param DocumentExtractor $extractor
     */
    public function pre_extract($extractor)
    {
        // Intentionally empty - override where needed.
    }

    /**
     * Called for post extraction activities. Does nothing by default.
     *
     * @param DocumentExtractor $extractor
     */
    public function post_extract($extractor)
    {
        // Intentionally empty - override where needed.
    }

    /**
     * Called for pre indexing activities. Does nothing by default.
     *
     * @param DocumentExtractor $extractor
     */
    public function pre_index($extractor)
    {
        // Intentionally empty - override where needed.
    }

    /**
     * Called for post indexing activities. Does nothing by default.
     *
     * @param DocumentExtractor $extractor
     */
    public function post_index($extractor)
    {
        // Intentionally empty - override where needed.
    }
}
?>
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?