extractorcore.inc.php
来自「PHP 知识管理系统(基于树结构的知识管理系统), 英文原版的PHP源码。」· PHP 代码 · 共 899 行 · 第 1/2 页
PHP
899 行
<?php
/**
* $Id:$
*
* KnowledgeTree Community Edition
* Document Management Made Simple
* Copyright (C) 2008 KnowledgeTree Inc.
* Portions copyright The Jam Warehouse Software (Pty) Limited
*
* This program is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License version 3 as published by the
* Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can contact KnowledgeTree Inc., PO Box 7775 #87847, San Francisco,
* California 94120-7775, or email info@knowledgetree.com.
*
* The interactive user interfaces in modified source and object code versions
* of this program must display Appropriate Legal Notices, as required under
* Section 5 of the GNU General Public License version 3.
*
* In accordance with Section 7(b) of the GNU General Public License version 3,
* these Appropriate Legal Notices must retain the display of the "Powered by
* KnowledgeTree" logo and retain the original copyright notice. If the display of the
* logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices
* must display the words "Powered by KnowledgeTree" and retain the original
* copyright notice.
* Contributor( s): ______________________________________
*
*/
/**
 * Abstract foundation shared by every text extractor.
 *
 * Holds the source/target file names, mime type, extension and document that an
 * extraction run works with, tracks the extraction and indexing status, and can
 * register the extractor's supported mime types in the database.
 */
abstract class DocumentExtractor
{
    /**
     * Name of the file that text is extracted from.
     *
     * @var string
     */
    protected $sourcefile;

    /**
     * Name of the file that receives the extracted text.
     *
     * @var string
     */
    protected $targetfile;

    /**
     * Mime type of the source file.
     *
     * @var string
     */
    protected $mimetype;

    /**
     * File extension of the source file.
     *
     * @var string
     */
    protected $extension;

    /**
     * The document currently being indexed.
     *
     * @var Document
     */
    protected $document;

    /**
     * Whether the extractor needs an intermediate copy of the source file.
     * The source file normally lives inside the repository itself, but some
     * extractors require the file to carry the correct extension. When this is
     * true, a file with the proper extension is created for the extractor.
     * Leave it disabled whenever possible.
     *
     * @var boolean
     */
    protected $needsIntermediate;

    /**
     * Outcome of the extraction. Null means extraction has not run yet.
     *
     * @var boolean
     */
    protected $extractionStatus;

    /**
     * Outcome of the indexing. Null means indexing has not run yet.
     *
     * @var boolean
     */
    protected $indexStatus;

    /**
     * Output captured when an error occurred.
     *
     * @var string
     */
    public $output;

    /**
     * Starts with both statuses unset (null) and no intermediate file required.
     */
    public function __construct()
    {
        $this->extractionStatus = null;
        $this->indexStatus = null;
        $this->needsIntermediate = false;
    }

    /**
     * Records the indexing status.
     *
     * @param boolean $status
     */
    public function setIndexingStatus($status)
    {
        $this->indexStatus = $status;
    }

    /**
     * Gives back the indexing status (null if indexing has not been done).
     *
     * @return boolean
     */
    public function getIndexingStatus()
    {
        return $this->indexStatus;
    }

    /**
     * Records the extraction status.
     *
     * @param boolean $status
     */
    public function setExtractionStatus($status)
    {
        $this->extractionStatus = $status;
    }

    /**
     * Gives back the extraction status (null if extraction has not been done).
     *
     * @return boolean
     */
    public function getExtractionStatus()
    {
        return $this->extractionStatus;
    }

    /**
     * Links every mime type supported by this extractor class to its
     * mime_extractors row, creating that row when it does not exist yet.
     * Does nothing when the extractor reports no supported mime types.
     */
    public function registerMimeTypes()
    {
        $mimeTypes = $this->getSupportedMimeTypes();
        if (empty($mimeTypes)) {
            return;
        }

        $classname = get_class($this);
        $rows = DBUtil::getResultArray(
            "select id as extractor_id from mime_extractors WHERE name='$classname'"
        );

        // Reuse the existing mime_extractors row, or insert one on first registration.
        $extractor_id = (count($rows) == 0)
            ? DBUtil::autoInsert('mime_extractors', array('name' => $classname, 'active' => 1))
            : $rows[0]['extractor_id'];

        foreach ($mimeTypes as $type) {
            // Only mime types that have no extractor assigned yet are claimed.
            DBUtil::runQuery(
                "update mime_types set extractor_id=$extractor_id where mimetypes='$type' and extractor_id is null"
            );
        }
    }

    /**
     * Reports whether an intermediate file is required, optionally updating the flag first.
     *
     * @param boolean $value Optional. When not null, becomes the new value of the flag.
     * @return boolean
     */
    public function needsIntermediateSourceFile($value = null)
    {
        if ($value !== null) {
            $this->needsIntermediate = $value;
        }

        return $this->needsIntermediate;
    }

    /**
     * Stores the name of the source file to extract from.
     *
     * @param string $sourcefile
     */
    public function setSourceFile($sourcefile)
    {
        $this->sourcefile = $sourcefile;
    }

    /**
     * Gives back the source file name.
     *
     * @return string
     */
    public function getSourceFile()
    {
        return $this->sourcefile;
    }

    /**
     * Stores the mime type of the source file.
     *
     * @param string $mimetype
     */
    public function setMimeType($mimetype)
    {
        $this->mimetype = $mimetype;
    }

    /**
     * Gives back the mime type of the source file.
     *
     * @return string
     */
    public function getMimeType()
    {
        return $this->mimetype;
    }

    /**
     * Stores the extension of the source file.
     *
     * @param string $extension
     */
    public function setExtension($extension)
    {
        $this->extension = $extension;
    }

    /**
     * Gives back the extension of the source file.
     *
     * @return string
     */
    public function getExtension()
    {
        return $this->extension;
    }

    /**
     * Stores the name of the text file that will hold the extracted text.
     *
     * @param string $targetfile
     */
    public function setTargetFile($targetfile)
    {
        $this->targetfile = $targetfile;
    }

    /**
     * Gives back the name of the text file containing the extracted text.
     *
     * @return string
     */
    public function getTargetFile()
    {
        return $this->targetfile;
    }

    /**
     * Hook applied to extracted text. The base implementation returns the
     * text unchanged; subclasses may override it.
     *
     * @param string $text
     * @return string
     */
    protected function filter($text)
    {
        return $text;
    }

    /**
     * Stores the document that is going to be indexed.
     *
     * @param Document $document
     */
    public function setDocument($document)
    {
        $this->document = $document;
    }

    /**
     * Gives back the document that was set for indexing.
     *
     * @return Document
     */
    public function getDocument()
    {
        return $this->document;
    }

    /**
     * Lists the mime types this extractor supports,
     * e.g. return array('plain/text');
     *
     * @return array
     */
    abstract public function getSupportedMimeTypes();

    /**
     * Extracts the content from the source file.
     *
     * @return boolean
     */
    abstract public function extractTextContent();

    /**
     * Gives a friendly, human-readable name for this text extractor.
     *
     * @return string
     */
    abstract public function getDisplayName();

    /**
     * Tries to diagnose problems with the indexing process.
     *
     * @return string
     */
    abstract public function diagnose();
}
/**
* This class extends the document extractor to execute some command line application.
* The getCommandLine() method needs to be overridden.
*
*/
abstract class ExternalDocumentExtractor extends DocumentExtractor
{
/**
* When true, exec() still reports success even if the command produced
* output on stderr (see the final return expression of exec()).
*
* @var boolean
*/
protected $allowOutput = false;
/**
* When true, the shell script generated by exec() on non-Windows systems
* redirects the command's stdout to /dev/null.
*
* @var boolean
*/
protected $pipeStdoutToDevNull = false;
/**
* Sets up the extractor and the environment variables for external commands.
*
* After running the parent constructor, LANG is set to en_US.UTF-8 and
* ooProgramPath is exported from the 'openoffice/programPath' config setting,
* falling back to the resolved KT_DIR/../openoffice/program directory.
*/
public function __construct()
{
    parent::__construct();
    putenv('LANG=en_US.UTF-8');

    // Fallback location used when no OpenOffice program path is configured.
    $fallbackPath = realpath(str_replace('\\', '/', KT_DIR . '/../openoffice/program'));
    $programPath = KTConfig::getSingleton()->get('openoffice/programPath', $fallbackPath);

    putenv('ooProgramPath=' . $programPath);
}
/**
* Controls whether exec() tolerates captured error output.
*
* @param boolean $allowOutput When true, exec() can report success even if the command wrote to stderr.
*/
public function setAllowOutput($allowOutput)
{
    $this->allowOutput = $allowOutput;
}
/**
* Executes a command through a temporary wrapper script. Returns true if successful.
*
* The command is written to a .bat (Windows) or .sh (other platforms) file in the
* configured temporary directory, with stderr redirected to a companion .out file.
* After the run, the captured stderr is stored in $this->output and both files
* are removed.
*
* @param string $cmd A command line instruction.
* @return boolean True when the exit code is 0 and either nothing was captured
*                 on stderr or captured output is permitted via $allowOutput.
*                 False when the script could not be created or made executable.
*/
protected function exec($cmd)
{
    $config = KTConfig::getSingleton();
    $temp_dir = $config->get('urls/tmpDirectory');
    $res = 0;
    $docid = $this->document->getId();
    // Script and output file share a prefix built from the current time and the document id.
    $script_prefix = $temp_dir . '/' . time() . '-' . $docid;
    $script_out = $script_prefix . '.out';
    // define the scripts that we want
    if (OS_WINDOWS)
    {
        $script_name = $script_prefix . '.bat';
        // Every line of the batch file uses CRLF line endings.
        $script = "rem This is an auto generated file. \r\n";
        $script .= $cmd . ' 2>"' . $script_out . "\"\r\n";
        $script .= "set er=%ERRORLEVEL%\r\n";
        $script .= "exit /B %er%\r\n";
    }
    else
    {
        $script_name = $script_prefix . '.sh';
        $script = "#!/bin/sh\n";
        $script .= "# This is an auto generated file. \n";
        $script .= $cmd . ' 2>>"' . $script_out . "\"";
        if ($this->pipeStdoutToDevNull)
        {
            $script .= " >/dev/null";
        }
        $script .= "\n";
        // Propagate the command's exit status as the script's exit status.
        $script .= "exit $?\n";
    }
    // write the script file
    if (file_put_contents($script_name, $script) === false)
    {
        $this->output = _kt('Could not create exec script: ') . $script_name;
        return false;
    }
    // execute the script file
    if (OS_WINDOWS)
    {
        $res = KTUtil::pexec("\"$script_name\"");
        $res = $res['ret'];
    }
    else
    {
        if (chmod($script_name, 0755) === false)
        {
            // Do not leave the unusable script behind in the temp directory.
            @unlink($script_name);
            // NOTE(review): message text was corrected ("Could" -> "Could not");
            // update translation catalogs if any are keyed on the old text.
            $this->output = _kt('Could not change permission on exec script: ') . $script_name;
            return false;
        }
        // Quote the path so a temp directory containing spaces still works,
        // mirroring the quoting used in the Windows branch and for the .out redirect.
        system('"' . $script_name . '"', $res);
    }
    // remove the script file and get the output if available
    @unlink($script_name);
    if (file_exists($script_out))
    {
        $this->output = file_get_contents($script_out);
        @unlink($script_out);
    }
    return ($res == 0) && (empty($this->output) || $this->allowOutput);
}
/**
* Returns the command line string to be executed.
* The command returned should include the target filename.
*
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?