extractorcore.inc.php

来自「PHP 知识管理系统(基于树结构的知识管理系统), 英文原版的PHP源码。」· PHP 代码 · 共 899 行 · 第 1/2 页

PHP
899
字号
	 * @return string
	 */
	protected function getCommandLine()
	{
		// This implementation never returns a command line: it always raises
		// an Exception carrying the translated message below.
		$message = _kt('getCommandLine is not implemented');

		throw new Exception($message);
	}

	/**
	 * Builds the command line with getCommandLine(), logs it at debug level
	 * and runs it through exec().
	 * Returns true on success.
	 *
	 * @return boolean
	 */
	public function extractTextContent()
	{
		global $default;

		// Resolve the command line first; getCommandLine() may throw.
		$commandLine = $this->getCommandLine();

		// Trace the command, prefixed with the name of the concrete class.
		$default->log->debug(get_class($this) . ': ' . $commandLine);

		// The outcome of exec() is handed back to the caller unchanged.
		return $this->exec($commandLine);
	}

}

/**
 * Extractor that first tries an external command line tool and, when that
 * tool is unavailable or its run fails, falls back to a StarOfficeExtractor
 * (if the use of OpenOffice is enabled in the configuration).
 */
abstract class OOFallbackDocumentExtractor extends ExternalDocumentExtractor
{
    /**
     * Value returned by KTUtil::findCommand() for the external binary.
     * Forced to false when the command is disabled in the configuration or
     * when running on Windows; false means "do not run the external command".
     *
     * @var mixed
     */
    protected $cmd;

    /**
     * Command line template, taken from the 'indexer/<cmd>cmdline' config
     * entry with the constructor argument as default.
     *
     * @var string
     */
    protected $params;

    /**
     * Value of the 'indexer/useOpenOffice' config entry (default true).
     *
     * Declared explicitly so that it is no longer created as a dynamic
     * property (deprecated since PHP 8.2). It is public because that is the
     * visibility it had as a dynamic property.
     *
     * @var boolean
     */
    public $useOO;

    /**
     * The fallback extractor; only instantiated when $useOO is true.
     *
     * @var StarOfficeExtractor
     */
    protected $oo;

    /**
     * The command name as passed to the constructor. Kept separately because
     * $cmd is overwritten with false when the binary cannot be used, and the
     * name is still needed for log messages.
     *
     * @var string
     */
    private $cmdName = '';

    /**
     * Resolves the external command and reads the related indexer settings.
     *
     * @param string $cmd    Name of the external command (used to build the config keys).
     * @param string $params Default command line template.
     */
    public function __construct($cmd, $params)
    {
        parent::__construct();
        $this->cmdName = $cmd;
        $this->cmd = KTUtil::findCommand('externalBinary/' . $cmd, false);

        $config = KTConfig::getSingleton();
        $this->params = $config->get('indexer/' . $cmd . 'cmdline', $params);
        $this->useOO = $config->get('indexer/useOpenOffice', true);
        if (!$config->get('indexer/use_' . $cmd, true) || OS_WINDOWS)
        {
            // the command is switched off in the config, or we are on Windows
            $this->cmd = false;
        }

        if ($this->useOO)
        {
            require_once('extractors/StarOfficeExtractor.inc.php');
            $this->oo = new StarOfficeExtractor();
        }
    }

    /**
     * Indicates whether an intermediate source file is required.
     *
     * @return boolean
     */
    public function needsIntermediateSourceFile()
    {
        // we need the intermediate file because it
        // has the correct extension. documentConverter uses the extension to determine mimetype

        return ($this->useOO);
    }

    /**
     * Expands the command line template and normalises backslashes to
     * forward slashes.
     *
     * @return string
     */
    protected function getCommandLine()
    {
        // These locals look unused but must stay: the template is expanded by
        // eval() below as a double-quoted string, so it can interpolate any
        // of them (presumably e.g. {$sourcefile} and {$targetfile}).
        $sourcefile = $this->sourcefile;
        $targetfile = $this->targetfile;
        $escape = '"';

        $cmd = $this->cmd;

        $cmdline = $this->params;
        // NOTE(review): SECURITY - eval() runs on a string that comes from the
        // configuration. Anyone who can edit the 'indexer/<cmd>cmdline' entry
        // can execute arbitrary PHP here, and a template containing an
        // unescaped double quote breaks the evaluation. Left unchanged because
        // the set of placeholders used by existing templates is not visible
        // from this file; consider replacing with explicit substitution.
        $cmdline = eval("return \"$cmdline\";");

        $cmdline = str_replace('\\','/',$cmdline);

        return $cmdline;
    }

    /**
     * Runs the external command when available; otherwise (or when it fails)
     * delegates to the StarOfficeExtractor. When neither can be used an empty
     * target file is written and true is returned.
     *
     * @return boolean
     */
    public function extractTextContent()
    {
        if ($this->cmd !== false)
        {
            // so we have catppt or something
            $result = parent::extractTextContent();
            if ($result !== false)
            {
                // if it returns true, we can bail
                return true;
            }

            // if failure, fallthrough, and attempt OO
        }

        if ($this->useOO)
        {
            // hand the current job over to the fallback extractor
            $this->oo->setSourceFile($this->sourcefile);
            $this->oo->setMimeType($this->mimetype);
            $this->oo->setExtension($this->extension);
            $this->oo->setTargetFile($this->targetfile);
            $this->oo->setDocument($this->document);
            $this->oo->setIndexingStatus(null);
            $this->oo->setExtractionStatus(null);

            $result = $this->oo->extractTextContent();

            // mirror the fallback extractor's state back onto this extractor
            $this->setIndexingStatus($this->oo->getIndexingStatus());
            $this->setExtractionStatus($this->oo->getExtractionStatus());
            $this->setTargetFile($this->oo->getTargetFile());

            return $result;
        }
        else
        {
            global $default;
            $docId = $this->document->getId();
            // $this->cmd is false when the binary is unavailable, which used
            // to leave the command name blank in the message below. Fall back
            // to the name the extractor was constructed with.
            $cmd = ($this->cmd !== false) ? $this->cmd : $this->cmdName;
            $default->log->info("The document {$docId} cannot be indexed as {$cmd} is not available and OpenOffice is not in use.");
            // an empty target file is written and the extraction is reported as successful
            file_put_contents($this->targetfile, '');
            return true;
        }
    }

    /**
     * Reports problems that could prevent extraction.
     *
     * @return mixed null when the external command is usable or OpenOffice is
     *               not in use, otherwise the fallback extractor's diagnosis.
     */
    public function diagnose()
    {
        if ($this->cmd !== false || !$this->useOO)
        {
            // cmd is found. we don't care about oo.
            // if we can't use oo, well, not much we can do....
            return null;
        }

        return $this->oo->diagnose();
    }
}

/**
 * An extension to the external document extractor. A derived class simply needs
 * to implement a constructor and getSupportedMimeTypes().
 *
 */
abstract class ApplicationExtractor extends ExternalDocumentExtractor
{
	/**
	 * Result of KTUtil::findCommand() for the application. diagnose() treats
	 * a value of false as "binary could not be located".
	 *
	 * @var mixed
	 */
	private $application;
	/**
	 * Name of the command, as passed to the constructor.
	 *
	 * @var string
	 */
	private $command;
	/**
	 * Human readable name of the extractor (untranslated).
	 *
	 * @var string
	 */
	private $displayname;
	/**
	 * Argument template for the application. The placeholders {source} and
	 * {target} are substituted by getCommandLine().
	 *
	 * @var string
	 */
	private $params;

	/**
	 * Set up the extractor and resolve the application to run.
	 *
	 * @param string $section The section in the config file.
	 * @param string $appname The application name in the config file.
	 * @param string $command The command that can be run.
	 * @param string $displayname The friendly name of the extractor.
	 * @param string $params The argument template ({source} / {target} placeholders).
	 */
	public function __construct($section, $appname, $command, $displayname, $params)
	{
		parent::__construct();

		$this->params = $params;
		$this->displayname = $displayname;
		$this->command = $command;
		$this->application = KTUtil::findCommand($section . '/' . $appname, $command);
	}

	/**
	 * Gives the translated display name.
	 *
	 * @return string
	 */
	public function getDisplayName()
	{
		$label = _kt($this->displayname);

		return $label;
	}

	/**
	 * Builds the command line: the quoted application followed by the
	 * argument template with {source} and {target} filled in.
	 *
	 * @return string
	 */
	protected function getCommandLine()
	{
		// double quotes on Windows, single quotes everywhere else
		$quote = OS_WINDOWS ? '"' : "'";

		$arguments = str_replace(
			array('{source}', '{target}'),
			array($this->sourcefile, $this->targetfile),
			$this->params
		);

		return $quote . $this->application . $quote . ' ' . $arguments;
	}

	/**
	 * Checks whether the application binary was located.
	 *
	 * @return mixed A message string if the binary is missing, null otherwise.
	 */
	public function diagnose()
	{
		if ($this->application !== false)
		{
			return null;
		}

		$template = _kt("Cannot locate binary for %s (%s).");

		return sprintf($template, $this->displayname, $this->command);
	}
}

abstract class TextExtractor extends DocumentExtractor
{
	/**
	 * Copies the source file to the target file, passing the content through
	 * filter(). At most 'indexer/maxTextSize' bytes (10 MB by default) are
	 * read from the source.
	 *
	 * @return boolean false when the source cannot be read or the target
	 *                 cannot be written, true otherwise.
	 */
	public function extractTextContent()
	{
		$config = KTConfig::getSingleton();
		$maxTextSize = $config->get('indexer/maxTextSize', 1024 * 1024 * 10); // we'll only take 10 meg by default

		// use_include_path and offset are not nullable: pass the values that
		// null used to be coerced to (false and 0) so that PHP 8.1+ does not
		// raise "passing null to parameter" deprecation notices.
		$content = file_get_contents($this->sourcefile, false, null, 0, $maxTextSize);
		if (false === $content)
		{
			return false;
		}

		$result = file_put_contents($this->targetfile, $this->filter($content));

		return false !== $result;
	}

	/**
	 * There are no external dependencies to diagnose.
	 *
	 * @return null
	 */
	public function diagnose()
	{
		return null;
	}

}

/**
 * The composite extractor implies that a conversion is done to an intermediate form before another extractor is run.
 *
 */
abstract class CompositeExtractor extends DocumentExtractor
{
	/**
	 * The initial extractor
	 *
	 * @var DocumentExtractor
	 */
	private $sourceExtractor;
	/**
	 * The text extractor
	 *
	 * @var DocumentExtractor
	 */
	private $targetExtractor;
	/**
	 * The extension for the initial extraction
	 *
	 * @var string
	 */
	private $targetExtension;
	/**
	 * The mime type of the initial extraction.
	 *
	 * @var string
	 */
	private $targetMimeType;

	/**
	 * Wires the two extractors together.
	 *
	 * @param DocumentExtractor $sourceExtractor Converts the document to the intermediate form.
	 * @param string $targetExtension Extension of the intermediate file.
	 * @param string $targetMimeType Mime type of the intermediate file.
	 * @param DocumentExtractor $targetExtractor Extracts the text from the intermediate file.
	 * @param mixed $needsIntermediate Passed to needsIntermediateSourceFile().
	 */
	public function __construct($sourceExtractor, $targetExtension, $targetMimeType, $targetExtractor, $needsIntermediate)
	{
		$this->sourceExtractor = $sourceExtractor;
		$this->targetExtractor = $targetExtractor;
		$this->targetExtension = $targetExtension;
		$this->targetMimeType = $targetMimeType;
		// NOTE(review): needsIntermediateSourceFile() is declared without
		// parameters elsewhere in this file, so $needsIntermediate may be
		// silently ignored here - confirm against the base class.
		$this->needsIntermediateSourceFile($needsIntermediate);
	}

	/**
	 * Runs the source extractor into an intermediate file and then the
	 * target extractor from that intermediate file into the target file.
	 * Intermediate files are removed before returning.
	 *
	 * @return mixed false when the source extraction fails, otherwise the
	 *               result of the target extractor (expected to be boolean).
	 */
	public function extractTextContent()
	{
		// placeholder that the source extractor is asked to write to
		$placeholderFile = $this->targetfile . '.' . $this->targetExtension;
		touch($placeholderFile);

		$this->sourceExtractor->setSourceFile($this->sourcefile);
		$this->sourceExtractor->setTargetFile($placeholderFile);
		$this->sourceExtractor->setDocument($this->getDocument());
		$this->sourceExtractor->setMimeType($this->mimetype);
		$this->sourceExtractor->setExtension($this->extension);
		if (!$this->sourceExtractor->extractTextContent())
		{
			$this->output = $this->sourceExtractor->output;
			@unlink($placeholderFile);

			// The source extractor may have redirected its output to another
			// file before failing; do not leave that one behind either.
			$partialFile = $this->sourceExtractor->getTargetFile();
			if (is_string($partialFile) && $partialFile !== '' && $partialFile !== $placeholderFile)
			{
				@unlink($partialFile);
			}

			return false;
		}
		// the extractor is allowed to change its target file, so ask for it again
		$intermediateFile = $this->sourceExtractor->getTargetFile();

		$this->targetExtractor->setSourceFile($intermediateFile);
		$this->targetExtractor->setTargetFile($this->targetfile);
		$this->targetExtractor->setDocument($this->getDocument());
		$this->targetExtractor->setMimeType($this->targetMimeType);
		$this->targetExtractor->setExtension($this->targetExtension);
		$result = $this->targetExtractor->extractTextContent();
		if (!$result)
		{
			$this->output = $this->targetExtractor->output;
		}

		@unlink($intermediateFile);
		$finalTarget = $this->targetExtractor->getTargetFile();

		// When the source extractor wrote somewhere else, the placeholder
		// created above was previously never removed. Clean it up, unless it
		// happens to be the file the final result was written to.
		if ($placeholderFile !== $intermediateFile && $placeholderFile !== $finalTarget)
		{
			@unlink($placeholderFile);
		}

		$this->setTargetFile($finalTarget);

		return $result;
	}

	/**
	 * Diagnose the extractors
	 *
	 * @return mixed The first non-empty diagnosis (source extractor first), or null.
	 */
	public function diagnose()
	{
		$diagnosis = $this->sourceExtractor->diagnose();
		if (!empty($diagnosis))
		{
			return $diagnosis;
		}

		$diagnosis = $this->targetExtractor->diagnose();
		if (!empty($diagnosis))
		{
			return $diagnosis;
		}

		return null;
	}
}


/**
 * An extractor hook provides callbacks for activities performed before and
 * after the extraction and indexing of a document.
 *
 */
abstract class ExtractorHook
{
	/**
	 * Lists the mime types this hook supports,
	 * for example: return array('plain/text');
	 *
	 * @return array
	 */
	abstract public function getSupportedMimeTypes();

	/**
	 * Gives the friendly name of the hook.
	 *
	 * @return string
	 */
	abstract public function getDisplayName();

	/**
	 * Basic self-diagnosis of the hook. This default implementation
	 * reports no problem.
	 *
	 * @return string
	 */
	public function diagnose()
	{
		// nothing to report by default
		return null;
	}

	/**
	 * Callback for pre extraction activities. Does nothing by default.
	 *
	 * @param DocumentExtractor $extractor
	 */
	public function pre_extract($extractor)
	{
		// intentionally empty
	}

	/**
	 * Callback for post extraction activities. Does nothing by default.
	 *
	 * @param DocumentExtractor $extractor
	 */
	public function post_extract($extractor)
	{
		// intentionally empty
	}

	/**
	 * Callback for pre indexing activities. Does nothing by default.
	 *
	 * @param DocumentExtractor $extractor
	 */
	public function pre_index($extractor)
	{
		// intentionally empty
	}

	/**
	 * Callback for post indexing activities. Does nothing by default.
	 *
	 * @param DocumentExtractor $extractor
	 */
	public function post_index($extractor)
	{
		// intentionally empty
	}
}

?>

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?