📄 extractorcore.inc.php.svn-base
字号:
<?php
/**
 * $Id:$
 *
 * KnowledgeTree Community Edition
 * Document Management Made Simple
 * Copyright (C) 2008 KnowledgeTree Inc.
 * Portions copyright The Jam Warehouse Software (Pty) Limited
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License version 3 as published by the
 * Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * You can contact KnowledgeTree Inc., PO Box 7775 #87847, San Francisco,
 * California 94120-7775, or email info@knowledgetree.com.
 *
 * The interactive user interfaces in modified source and object code versions
 * of this program must display Appropriate Legal Notices, as required under
 * Section 5 of the GNU General Public License version 3.
 *
 * In accordance with Section 7(b) of the GNU General Public License version 3,
 * these Appropriate Legal Notices must retain the display of the "Powered by
 * KnowledgeTree" logo and retain the original copyright notice. If the display of the
 * logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices
 * must display the words "Powered by KnowledgeTree" and retain the original
 * copyright notice.
 * Contributor( s): ______________________________________
 *
 */

/**
 * DocumentExtractor is the base class for all text extractors.
 *
 * It holds the per-document extraction state (source/target file names, mime
 * type, extension, document reference, status flags) and leaves the actual
 * extraction to subclasses through the abstract methods at the bottom.
 */
abstract class DocumentExtractor
{
    /**
     * The source filename from which to extract text.
     *
     * @var string
     */
    protected $sourcefile;

    /**
     * The target filename, where the extracted text must be stored.
     *
     * @var string
     */
    protected $targetfile;

    /**
     * The mime type of the source file.
     *
     * @var string
     */
    protected $mimetype;

    /**
     * The extension of the source file.
     *
     * @var string
     */
    protected $extension;

    /**
     * Reference to the document being indexed.
     *
     * @var Document
     */
    protected $document;

    /**
     * Indicates if the extractor needs an intermediate file or not.
     * Generally the source file will be a file within the repository itself. Some extractors may
     * require the source file to have the correct extension. Setting this to true will result in
     * a file being created with the extension of the file. It is ideal to disable this if possible.
     *
     * @var boolean
     */
    protected $needsIntermediate;

    /**
     * The status of the extraction. If null, the extraction has not been done yet.
     *
     * @var boolean
     */
    protected $extractionStatus;

    /**
     * The status of the indexing. If null, the indexing has not been done yet.
     *
     * @var boolean
     */
    protected $indexStatus;

    /**
     * If an error occurred, this is the output that was captured
     *
     * @var string
     */
    public $output;

    /**
     * Starts with no intermediate file required and both statuses unset (null).
     */
    public function __construct()
    {
        $this->needsIntermediate = false;
        $this->extractionStatus = null;
        $this->indexStatus = null;
    }

    /**
     * Sets the status of the indexing.
     *
     * @param boolean $status
     */
    public function setIndexingStatus($status)
    {
        $this->indexStatus = $status;
    }

    /**
     * Returns the indexing status.
     *
     * @return boolean Null until setIndexingStatus() has been called.
     */
    public function getIndexingStatus()
    {
        return $this->indexStatus;
    }

    /**
     * Sets the extraction status.
     *
     * @param boolean $status
     */
    public function setExtractionStatus($status)
    {
        $this->extractionStatus = $status;
    }

    /**
     * Return the extraction status.
     *
     * @return boolean Null until setExtractionStatus() has been called.
     */
    public function getExtractionStatus()
    {
        return $this->extractionStatus;
    }

    /**
     * Associates every mime type reported by getSupportedMimeTypes() with this
     * extractor class in the database.
     *
     * The extractor is looked up in mime_extractors by class name and inserted
     * (active) when missing. Each supported mime type that has no extractor yet
     * (extractor_id is null) is then pointed at this extractor; mime types that
     * already have an extractor are left alone. Nothing happens when the list of
     * supported mime types is empty.
     */
    public function registerMimeTypes()
    {
        $types = $this->getSupportedMimeTypes();
        if (empty($types)) {
            return;
        }

        $classname = get_class($this);

        // Values are passed as placeholder parameters rather than being pasted
        // into the SQL text, so a quote in a class name or mime type cannot break
        // the statement.
        // NOTE(review): relies on DBUtil accepting array($sql, $params) - confirm.
        $sql = 'select id as extractor_id from mime_extractors WHERE name=?';
        $rs = DBUtil::getResultArray(array($sql, array($classname)));
        if (count($rs) == 0) {
            $extractor_id = DBUtil::autoInsert('mime_extractors', array('name' => $classname, 'active' => 1));
        } else {
            $extractor_id = $rs[0]['extractor_id'];
        }

        $sql = 'update mime_types set extractor_id=? where mimetypes=? and extractor_id is null';
        foreach ($types as $type) {
            DBUtil::runQuery(array($sql, array($extractor_id, $type)));
        }
    }

    /**
     * Indicates if an intermediate file is required.
     *
     * Acts as a combined getter/setter: any non-null argument is stored first.
     *
     * @param boolean $value Optional. If set, we set the value.
     * @return boolean The (possibly just updated) setting.
     */
    public function needsIntermediateSourceFile($value = null)
    {
        if (!is_null($value)) {
            $this->needsIntermediate = $value;
        }
        return $this->needsIntermediate;
    }

    /**
     * Sets the source filename for the document extractor.
     *
     * @param string $sourcefile
     */
    public function setSourceFile($sourcefile)
    {
        $this->sourcefile = $sourcefile;
    }

    /**
     * Returns the source file name.
     *
     * @return string
     */
    public function getSourceFile()
    {
        return $this->sourcefile;
    }

    /**
     * Sets the source file's mime type.
     *
     * @param string $mimetype
     */
    public function setMimeType($mimetype)
    {
        $this->mimetype = $mimetype;
    }

    /**
     * Returns the mime type for the source file.
     *
     * @return string
     */
    public function getMimeType()
    {
        return $this->mimetype;
    }

    /**
     * Indicates the extension for the source file.
     *
     * @param string $extension
     */
    public function setExtension($extension)
    {
        $this->extension = $extension;
    }

    /**
     * Returns the extension of the source file.
     *
     * @return string
     */
    public function getExtension()
    {
        return $this->extension;
    }

    /**
     * Sets the file name of the target text file.
     *
     * @param string $targetfile
     */
    public function setTargetFile($targetfile)
    {
        $this->targetfile = $targetfile;
    }

    /**
     * Gets the file name of the target text file containing the extracted text.
     *
     * @return string
     */
    public function getTargetFile()
    {
        return $this->targetfile;
    }

    /**
     * Filter function that may be applied after extraction. This may be overridden.
     * The base implementation returns the text unchanged.
     *
     * @param string $text
     * @return string
     */
    protected function filter($text)
    {
        return $text;
    }

    /**
     * Set the document that will be indexed.
     *
     * @param Document $document
     */
    public function setDocument($document)
    {
        $this->document = $document;
    }

    /**
     * Returns a reference to the document.
     *
     * @return Document
     */
    public function getDocument()
    {
        return $this->document;
    }

    /**
     * Returns an array of supported mime types.
     * e.g. return array('plain/text');
     *
     * @return array
     */
    abstract public function getSupportedMimeTypes();

    /**
     * Extracts the content from the source file.
     *
     * @return boolean
     */
    abstract public function extractTextContent();

    /**
     * Returns a friendly name for the document text extractor.
     *
     * @return string
     */
    abstract public function getDisplayName();

    /**
     * Attempts to diagnose any problems with the indexing process.
     *
     * @return string
     */
    abstract public function diagnose();
}

/**
 * This class extends the document extractor to execute some command line application.
 * The getCommandLine() method needs to be overridden.
 *
 */
abstract class ExternalDocumentExtractor extends DocumentExtractor
{
    /**
     * When true, exec() still reports success if the command wrote something to
     * its captured error output.
     *
     * @var boolean
     */
    protected $allowOutput = false;

    /**
     * When true, the generated shell script (non-Windows) sends the command's
     * standard output to /dev/null.
     *
     * @var boolean
     */
    protected $pipeStdoutToDevNull = false;

    /**
     * Initialise the extractor.
     *
     * Forces LANG to en_US.UTF-8 for child processes and exports the OpenOffice
     * program directory as the ooProgramPath environment variable. The path is
     * read from the 'openoffice/programPath' config entry and falls back to the
     * openoffice/program directory next to KT_DIR.
     */
    public function __construct()
    {
        parent::__construct();

        putenv('LANG=en_US.UTF-8');

        $config = KTConfig::getSingleton();

        $default = realpath(str_replace('\\', '/', KT_DIR . '/../openoffice/program'));

        putenv('ooProgramPath=' . $config->get('openoffice/programPath', $default));
    }

    /**
     * Controls whether captured command output is tolerated by exec().
     *
     * @param boolean $allowOutput
     */
    public function setAllowOutput($allowOutput)
    {
        $this->allowOutput = $allowOutput;
    }

    /**
     * Executes a command. Returns true if successful.
     *
     * @param string $cmd A command line instruction.
* @return boolean True only when the script exits with status 0 and either nothing
     *                 was captured in the error file or output is allowed
     *                 (see $allowOutput). On failure $this->output holds the
     *                 captured text or a description of what went wrong.
     */
    protected function exec($cmd)
    {
        $config = KTConfig::getSingleton();
        $temp_dir = $config->get('urls/tmpDirectory');
        $res = 0;

        $docid = $this->document->getId();

        // Both the wrapper script and its stderr capture live in the temp
        // directory, named after the current time and the document id.
        $script_prefix = $temp_dir . '/' . time() . '-' . $docid;
        $script_out = $script_prefix . '.out';

        // define the scripts that we want
        if (OS_WINDOWS) {
            $script_name = $script_prefix . '.bat';

            $script = "rem This is an auto generated file. \n";
            $script .= $cmd . ' 2>"' . $script_out . "\"\r\n";
            // Preserve the command's exit code as the batch file's exit code.
            $script .= "set er=%ERRORLEVEL%\r\n";
            $script .= "exit /B %er%\r\n";
        } else {
            $script_name = $script_prefix . '.sh';

            $script = "#!/bin/sh\n";
            $script .= "# This is an auto generated file. \n";
            $script .= $cmd . ' 2>>"' . $script_out . "\"";
            if ($this->pipeStdoutToDevNull) {
                $script .= " >/dev/null";
            }
            $script .= "\n";
            $script .= "exit $?\n";
        }

        // write the script file
        if (file_put_contents($script_name, $script) === false) {
            $this->output = _kt('Could not create exec script: ') . $script_name;
            return false;
        }

        // execute the script file
        if (OS_WINDOWS) {
            $res = KTUtil::pexec("\"$script_name\"");
            $res = $res['ret'];
        } else {
            if (chmod($script_name, 0755) === false) {
                // The script can never be run, so do not leave it behind in the
                // temp directory.
                @unlink($script_name);
                $this->output = _kt('Could not change permission on exec script: ') . $script_name;
                return false;
            }
            system($script_name, $res);
        }

        // remove the script file and get the output if available
        @unlink($script_name);

        if (file_exists($script_out)) {
            $this->output = file_get_contents($script_out);
            @unlink($script_out);
        }

        return ($res == 0) && (empty($this->output) || $this->allowOutput);
    }

    /**
     * Returns the command line string to be executed.
     * The command returned should include the target filename.
     *
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -