extractorcore.inc.php

来自「PHP 知识管理系统(基于树结构的知识管理系统), 英文原版的PHP源码。」· PHP 代码 · 共 899 行 · 第 1/2 页

PHP
899
字号
<?php

/**
 * $Id:$
 *
 * KnowledgeTree Community Edition
 * Document Management Made Simple
 * Copyright (C) 2008 KnowledgeTree Inc.
 * Portions copyright The Jam Warehouse Software (Pty) Limited
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License version 3 as published by the
 * Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * You can contact KnowledgeTree Inc., PO Box 7775 #87847, San Francisco,
 * California 94120-7775, or email info@knowledgetree.com.
 *
 * The interactive user interfaces in modified source and object code versions
 * of this program must display Appropriate Legal Notices, as required under
 * Section 5 of the GNU General Public License version 3.
 *
 * In accordance with Section 7(b) of the GNU General Public License version 3,
 * these Appropriate Legal Notices must retain the display of the "Powered by
 * KnowledgeTree" logo and retain the original copyright notice. If the display of the
 * logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices
 * must display the words "Powered by KnowledgeTree" and retain the original
 * copyright notice.
 * Contributor(s): ______________________________________
 *
 */

/**
 * DocumentExtractor is the base class for all text extractors.
 *
 */
abstract class DocumentExtractor
{
	/**
	 * Name of the file the text is extracted from.
	 *
	 * @var string
	 */
	protected $sourcefile;

	/**
	 * Name of the file that must receive the extracted text.
	 *
	 * @var string
	 */
	protected $targetfile;

	/**
	 * Mime type of the source file.
	 *
	 * @var string
	 */
	protected $mimetype;

	/**
	 * Extension of the source file.
	 *
	 * @var string
	 */
	protected $extension;

	/**
	 * The document that is being indexed.
	 *
	 * @var Document
	 */
	protected $document;

	/**
	 * Whether the extractor needs an intermediate file.
	 * The source file is generally a file inside the repository itself. Some extractors
	 * require the source file to carry the correct extension; setting this to true results
	 * in a file being created with the extension of the file. Disable it where possible.
	 *
	 * @var boolean
	 */
	protected $needsIntermediate;

	/**
	 * Outcome of the extraction. Null means the extraction has not been done yet.
	 *
	 * @var boolean
	 */
	protected $extractionStatus;

	/**
	 * Outcome of the indexing. Null means the indexing has not been done yet.
	 *
	 * @var boolean
	 */
	protected $indexStatus;

	/**
	 * Output that was captured when an error occurred.
	 *
	 * @var string
	 */
	public $output;


	/**
	 * Starts with no intermediate file required and both statuses unset (null).
	 */
	public function __construct()
	{
		$this->extractionStatus = null;
		$this->indexStatus = null;
		$this->needsIntermediate = false;
	}

	/**
	 * Records the indexing status.
	 *
	 * @param boolean $status
	 */
	public function setIndexingStatus($status)
	{
		$this->indexStatus = $status;
	}

	/**
	 * Gives back the recorded indexing status (null until one has been set).
	 *
	 * @return boolean
	 */
	public function getIndexingStatus()
	{
		return $this->indexStatus;
	}

	/**
	 * Records the extraction status.
	 *
	 * @param boolean $status
	 */
	public function setExtractionStatus($status)
	{
		$this->extractionStatus = $status;
	}

	/**
	 * Gives back the recorded extraction status (null until one has been set).
	 *
	 * @return boolean
	 */
	public function getExtractionStatus()
	{
		return $this->extractionStatus;
	}

	/**
	 * Links every mime type reported by getSupportedMimeTypes() to this extractor class.
	 *
	 * Looks the class name up in mime_extractors, inserting a new active row when no
	 * row is found, and then sets extractor_id on the matching mime_types rows that do
	 * not have an extractor yet. Does nothing when no mime types are reported.
	 */
	public function registerMimeTypes()
	{
		$mimeTypes = $this->getSupportedMimeTypes();
		if (empty($mimeTypes))
		{
			return;
		}

		$extractorName = get_class($this);

		$lookup = "select id as extractor_id from mime_extractors WHERE name='$extractorName'";
		$rows = DBUtil::getResultArray($lookup);

		// Reuse the existing extractor row when there is one, otherwise create it.
		$extractor_id = (count($rows) == 0)
			? DBUtil::autoInsert('mime_extractors', array('name'=>$extractorName, 'active'=>1))
			: $rows[0]['extractor_id'];

		// Only rows without an extractor (extractor_id is null) are claimed.
		foreach ($mimeTypes as $mimeType)
		{
			DBUtil::runQuery("update mime_types set extractor_id=$extractor_id where mimetypes='$mimeType' and extractor_id is null");
		}
	}

	/**
	 * Combined getter/setter for the intermediate-file flag.
	 *
	 * @param $value boolean Optional. When not null, it becomes the new flag value.
	 * @return boolean The flag value after any update.
	 */
	public function needsIntermediateSourceFile($value = null)
	{
		if ($value !== null)
		{
			$this->needsIntermediate = $value;
		}

		return $this->needsIntermediate;
	}

	/**
	 * Stores the source filename for the document extractor.
	 *
	 * @param string $sourcefile
	 */
	public function setSourceFile($sourcefile)
	{
		$this->sourcefile = $sourcefile;
	}

	/**
	 * Gives back the source file name.
	 *
	 * @return string
	 */
	public function getSourceFile()
	{
		return $this->sourcefile;
	}

	/**
	 * Stores the mime type of the source file.
	 *
	 * @param string $mimetype
	 */
	public function setMimeType($mimetype)
	{
		$this->mimetype = $mimetype;
	}

	/**
	 * Gives back the mime type of the source file.
	 *
	 * @return string
	 */
	public function getMimeType()
	{
		return $this->mimetype;
	}

	/**
	 * Stores the extension of the source file.
	 *
	 * @param string $extension
	 */
	public function setExtension($extension)
	{
		$this->extension = $extension;
	}

	/**
	 * Gives back the extension of the source file.
	 *
	 * @return string
	 */
	public function getExtension()
	{
		return $this->extension;
	}

	/**
	 * Stores the file name of the target text file.
	 *
	 * @param string $targetfile
	 */
	public function setTargetFile($targetfile)
	{
		$this->targetfile = $targetfile;
	}

	/**
	 * Gives back the file name of the target text file holding the extracted text.
	 *
	 * @return string
	 */
	public function getTargetFile()
	{
		return $this->targetfile;
	}

	/**
	 * Hook that may be applied to the text after extraction. The base implementation
	 * hands the text back unchanged; subclasses may override it.
	 *
	 * @param string $text
	 * @return string
	 */
	protected function filter($text)
	{
		return $text;
	}

	/**
	 * Stores the document that will be indexed.
	 *
	 * @param Document $document
	 */
	public function setDocument($document)
	{
		$this->document = $document;
	}

	/**
	 * Gives back the document that was set with setDocument().
	 *
	 * @return Document
	 */
	public function getDocument()
	{
		return $this->document;
	}

	/**
	 * Lists the mime types the extractor supports.
	 * e.g. return array('plain/text');
	 *
	 * @return array
	 */
	abstract public function getSupportedMimeTypes();

	/**
	 * Extracts the content from the source file.
	 *
	 * @return boolean
	 */
	abstract public function extractTextContent();

	/**
	 * Gives a friendly name for the document text extractor.
	 *
	 * @return string
	 */
	abstract public function getDisplayName();

	/**
	 * Attempts to diagnose any problems with the indexing process.
	 *
	 * @return string
	 */
	abstract public function diagnose();

}

/**
 * This class extends the document extractor to execute some command line application.
 * The getCommandLine() method needs to be overridden.
 *
 */
abstract class ExternalDocumentExtractor extends DocumentExtractor
{
    protected $allowOutput = false;
    protected $pipeStdoutToDevNull = false;

	/**
	 * Initialise the extractor.
	 *
	 */
	public function __construct()
	{
		parent::__construct();

		// Export LANG=en_US.UTF-8 into the process environment.
		putenv('LANG=en_US.UTF-8');

		$settings = KTConfig::getSingleton();

		// Resolve <KT_DIR>/../openoffice/program with backslashes turned into
		// forward slashes; this value is handed to the config lookup as its
		// second argument (presumably the fallback when the setting is absent).
		$bundledProgramDir = str_replace('\\', '/', KT_DIR . '/../openoffice/program');
		$bundledProgramDir = realpath($bundledProgramDir);

		$programPath = $settings->get('openoffice/programPath', $bundledProgramDir);

		// Export the resulting path as the ooProgramPath environment variable.
		putenv('ooProgramPath=' . $programPath);
	}

	/**
	 * Sets whether captured output is tolerated.
	 *
	 * exec() only reports success with non-empty captured output when this
	 * flag is set.
	 *
	 * @param boolean $allowOutput
	 */
	public function setAllowOutput($allowOutput)
	{
		$this->allowOutput = $allowOutput;
	}

	/**
	 * Executes a command. Returns true if successful.
	 *
	 * @param string $cmd A command line instruction.
	 * @return boolean
	 */
	protected function exec($cmd)
	{
		$config = KTConfig::getSingleton();
		$temp_dir = $config->get('urls/tmpDirectory');
		$res = 0;

		$docid = $this->document->getId();

		// The wrapper script and the file capturing stderr share a prefix built
		// from the temp directory, the current time and the document id.
		$script_prefix = $temp_dir . '/' . time() . '-' . $docid;
		$script_out = $script_prefix . '.out';

		// define the scripts that we want

		if (OS_WINDOWS)
		{
			$script_name = $script_prefix . '.bat';

			// Batch file: run the command with stderr redirected to the .out
			// file, then exit with the command's ERRORLEVEL.
			$script = "rem This is an auto generated file. \n";
			$script .= $cmd . ' 2>"' . $script_out . "\"\r\n";
			$script .= "set er=%ERRORLEVEL%\r\n";
			$script .= "exit /B %er%\r\n";
		}
		else
		{
			$script_name = $script_prefix . '.sh';

			// Shell script: run the command with stderr appended to the .out
			// file, then exit with the command's exit status.
			$script = "#!/bin/sh\n";
			$script .= "# This is an auto generated file. \n";
			$script .= $cmd . ' 2>>"' . $script_out . "\"";

			if ($this->pipeStdoutToDevNull)
			{
				$script .= " >/dev/null";
			}

			$script .= "\n";

			$script .= "exit $?\n";
		}

		// write the script file
		if (file_put_contents($script_name, $script) === false)
		{
			$this->output = _kt('Could not create exec script: ') . $script_name;
			return false;
		}

		// execute the script file
		if (OS_WINDOWS)
		{
			$res = KTUtil::pexec("\"$script_name\"");
			$res = $res['ret'];
		}
		else
		{
			if (chmod($script_name, 0755) === false)
			{
				// Do not leave the generated script behind in the temp directory.
				@unlink($script_name);
				$this->output = _kt('Could not change permission on exec script: ') . $script_name;
				return false;
			}

			// Quote the path so a temp directory containing spaces or shell
			// metacharacters still runs (the Windows branch quotes it as well).
			system(escapeshellarg($script_name), $res);
		}

		// remove the script file and get the output if available
		@unlink($script_name);

		if (file_exists($script_out))
		{
			$this->output = file_get_contents($script_out);
			@unlink($script_out);
		}

		// Success needs a zero exit code and, unless output is explicitly
		// allowed, nothing captured in the .out file.
		return ($res == 0) && (empty($this->output) || $this->allowOutput);
	}

	/**
	 * Returns the command line string to be executed.
	 * The command returned should include the target filename.
	 *

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?