openxmltextextractor.inc.php

来自「PHP 知识管理系统（基于树结构的知识管理系统）, 英文原版的PHP源码。」· PHP 代码 · 共 322 行
PHP
322 行
<?php

/**
 * $Id:$
 *
 * KnowledgeTree Community Edition
 * Document Management Made Simple
 * Copyright (C) 2008 KnowledgeTree Inc.
 * Portions copyright The Jam Warehouse Software (Pty) Limited
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License version 3 as published by the
 * Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * You can contact KnowledgeTree Inc., PO Box 7775 #87847, San Francisco,
 * California 94120-7775, or email info@knowledgetree.com.
 *
 * The interactive user interfaces in modified source and object code versions
 * of this program must display Appropriate Legal Notices, as required under
 * Section 5 of the GNU General Public License version 3.
 *
 * In accordance with Section 7(b) of the GNU General Public License version 3,
 * these Appropriate Legal Notices must retain the display of the "Powered by
 * KnowledgeTree" logo and retain the original copyright notice. If the display of the
 * logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices
 * must display the words "Powered by KnowledgeTree" and retain the original
 * copyright notice.
 * Contributor( s): ______________________________________
 *
 */

/**
 * Text extractor for Office Open XML documents (the docx, pptx and xlsx families).
 *
 * The source document is handled as a zip archive. The external unzip command is
 * used to pull [Content_Types].xml out of the archive; every part whose content
 * type is considered worth indexing is then extracted on its own, stripped of its
 * markup, and the concatenated text is written to the target file.
 */
class OpenXmlTextExtractor extends ExternalDocumentExtractor
{
	/**
	 * The unzip command as returned by KTUtil::findCommand(). When it is a string,
	 * backslashes have been converted to forward slashes.
	 *
	 * The properties below used to be created dynamically (and were therefore
	 * public); they are declared public so that visibility does not change and
	 * so that PHP 8.2 does not raise dynamic-property deprecations.
	 *
	 * @var mixed
	 */
	public $unzip;

	/**
	 * Argument template for unzip, using the placeholders {source}, {part}
	 * and {target_dir}.
	 *
	 * @var string
	 */
	public $unzip_params;

	/**
	 * Temporary directory that archive parts are extracted into. It is set by
	 * getOpenXmlContentTypes().
	 *
	 * @var string
	 */
	public $openxml_dir;

	public function __construct()
	{
		$config = KTConfig::getSingleton();

		$unzip = KTUtil::findCommand("import/unzip", 'unzip');
		// Only normalise real paths. str_replace() would silently turn a non-string
		// "not found" result (diagnose() expects false) into '' and hide the failure.
		if (is_string($unzip))
		{
			$unzip = str_replace('\\', '/', $unzip);
		}
		$this->unzip = $unzip;
		$this->unzip_params = $config->get('extractorParameters/unzip', '"{source}" "{part}" -d "{target_dir}"');
		parent::__construct();
	}


	/**
	 * Basic function setting the display name
	 *
	 * @return string
	 */
	public function getDisplayName()
	{
		return _kt('Open Xml Text Extractor');
	}

	/**
	 * This extractor always asks for an intermediate source file.
	 *
	 * @return boolean
	 */
	public function needsIntermediateSourceFile()
	{
		return true;
	}

	/**
	 * Single definition of the supported document mime types, grouped by the
	 * short format name (docx, pptx, xlsx) used by interestingParts().
	 *
	 * Static so that it does not depend on an instance being available.
	 *
	 * @return array format name => list of mime types
	 */
	private static function openXmlMimeTypesByFormat()
	{
		return array(
			'docx' => array(
					'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
					'application/vnd.openxmlformats-officedocument.wordprocessingml.template'),
			'pptx' => array(
					'application/vnd.openxmlformats-officedocument.presentationml.template',
					'application/vnd.openxmlformats-officedocument.presentationml.slideshow',
					'application/vnd.openxmlformats-officedocument.presentationml.presentation'),
			'xlsx' => array(
					'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
					'application/vnd.openxmlformats-officedocument.spreadsheetml.template')
		);
	}

	/**
	 * Return a list of all Office 2007 document types that are supported
	 *
	 * @return array
	 */
	public function getSupportedMimeTypes()
	{
		$supported = array();
		foreach (self::openXmlMimeTypesByFormat() as $mime_types)
		{
			$supported = array_merge($supported, $mime_types);
		}
		return $supported;
	}

	/**
	 * Trivial function to resolve if the document is word, excel, or power point
	 *
	 * @return string|null 'docx', 'pptx' or 'xlsx'; null when $this->mimetype is
	 *                     not one of the supported mime types
	 */
	private function detectDocumentType()
	{
		foreach (self::openXmlMimeTypesByFormat() as $openxml_type => $mime_types)
		{
			if (in_array($this->mimetype, $mime_types, true))
			{
				return $openxml_type;
			}
		}
		return null;
	}

	/**
	 * The open xml file comprises various file with different content. This function identifies
	 * which of those content types are worth indexing.
	 *
	 * @param string $openxml_type 'docx', 'pptx' or 'xlsx'; anything else (including
	 *                             null) is never interesting
	 * @param string $mime_type    content type of a part of the archive
	 * @return boolean
	 */
	private function interestingParts($openxml_type, $mime_type)
	{
		$interest = array(
			'docx' => array(
					'application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml',
					'application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml',
					'application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml',
					'application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml'),
			'pptx' => array(
					'application/vnd.openxmlformats-officedocument.presentationml.slide+xml'),
			'xlsx' => array(
					'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml',
					'application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml',
					'application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml',
					'application/vnd.openxmlformats-package.core-properties+xml'));

		// An unknown document type has no interesting parts; without this guard
		// in_array() would be handed a non-array.
		if (!is_string($openxml_type) || !isset($interest[$openxml_type]))
		{
			return false;
		}

		return in_array($mime_type, $interest[$openxml_type], true);
	}

	/**
	 * Build the unzip command line that extracts $part from the source file into
	 * the temporary directory.
	 *
	 * @param string $part archive member (or unzip pattern) to extract
	 * @return string
	 */
	private function buildOpenXmlUnzipCommand($part)
	{
		$params = str_replace(
			array('{source}', '{part}', '{target_dir}'),
			array($this->sourcefile, $part, $this->openxml_dir),
			$this->unzip_params);

		return '"' . $this->unzip . '" ' . $params;
	}

	/**
	 * Check that a part name taken from [Content_Types].xml can be placed inside the
	 * double-quoted {part} argument of the unzip command and appended to the
	 * temporary directory path.
	 *
	 * The part name comes from the document being indexed, so it is not trusted.
	 *
	 * @param mixed $part part name without its leading slash
	 * @return boolean
	 */
	private function isSafePartName($part)
	{
		if (!is_string($part) || $part === '')
		{
			return false;
		}

		// '..' segments would let the path that is read and unlinked point outside
		// of the temporary directory.
		if (preg_match('@(^|/)\.\.(/|$)@', $part))
		{
			return false;
		}

		// Characters that can terminate the double-quoted argument or trigger
		// substitution in a shell.
		return strpbrk($part, "\"`\$\r\n\0") === false;
	}

	/**
	 * Returns a list of tokens that were identified by the [Content_Types].xml file. This file lists links to all parts of the document.
	 * We use interestingParts() above to identify which of these parts are interesting from a content perspective.
	 *
	 * Side effects: sets $this->openxml_dir, and rewrites backslashes in
	 * $this->sourcefile to forward slashes. On failure $this->output holds a message.
	 *
	 * @return array|false the structure produced by xml_parse_into_struct(), or
	 *                     false on failure
	 */
	private function getOpenXmlContentTypes()
	{
		$config = KTConfig::getSingleton();
		$temp_dir = $config->get('urls/tmpDirectory');

		$docid = $this->document->getId();
		$dir_name = 'ktindexer_openxml_' . time() . '-' . $docid;

		$this->sourcefile = str_replace('\\', '/', $this->sourcefile);
		$this->openxml_dir = str_replace('\\', '/', $temp_dir . '/' . $dir_name);

		$cmd = str_replace('\\', '/', $this->buildOpenXmlUnzipCommand('*Content_Types*.xml'));

		if (!$this->exec($cmd))
		{
			$this->output = _kt('Failed to execute command: ') . $cmd;
			return false;
		}

		$filename = $this->openxml_dir . '/[Content_Types].xml';
		if (!file_exists($filename))
		{
			$this->output = _kt('Failed to find file: ') . $filename;
			return false;
		}

		$xml_content = file_get_contents($filename);

		// once we have the content, we can cleanup!
		@unlink($filename);

		if ($xml_content === false)
		{
			$this->output = _kt('Failed to find file: ') . $filename;
			return false;
		}

		// parse the file
		$vals = array();
		$index = array();
		$parser = xml_parser_create();
		$parsed = xml_parse_into_struct($parser, $xml_content, $vals, $index);
		xml_parser_free($parser);

		// xml_parse_into_struct() returns 0 when the document could not be parsed.
		if (!$parsed)
		{
			$this->output = _kt('Failed to parse file: ') . $filename;
			return false;
		}

		return $vals;
	}

	/**
	 * Extract the text from a file within the archive for a specific file.
	 *
	 * The part is unzipped into the temporary directory, read, deleted again, and
	 * every run of markup tags is replaced by a single space. On failure
	 * $this->output holds a message.
	 *
	 * @param string $filename part name as listed in [Content_Types].xml
	 * @return string|false the text of the part, or false on failure
	 */
	private function getContent($filename)
	{
		// Normalise separators before stripping the leading slash so that a
		// leading backslash is handled too.
		$filename = str_replace('\\', '/', $filename);
		if (substr($filename, 0, 1) == '/')
		{
			$filename = substr($filename, 1);
		}

		if (!$this->isSafePartName($filename))
		{
			$this->output = _kt('Invalid part name: ') . $filename;
			return false;
		}

		$cmd = $this->buildOpenXmlUnzipCommand($filename);

		if (!$this->exec($cmd))
		{
			$this->output = _kt('Failed to execute command: ') . $cmd;
			return false;
		}

		$filename = $this->openxml_dir . "/$filename";
		if (!file_exists($filename))
		{
			$this->output = _kt('Failed to open file: ') . $filename;
			return false;
		}

		$content = file_get_contents($filename);

		// cleanup
		@unlink($filename);

		if ($content === false)
		{
			$this->output = _kt('Failed to open file: ') . $filename;
			return false;
		}

		$content = preg_replace ("@(</?[^>]*>)+@", " ", $content);

		return $content;
	}


	/**
	 * Given the tokens in the [Content_Types].xml, extract the content
	 *
	 * Only complete OVERRIDE elements that carry both a ContentType and a PartName
	 * attribute are considered (xml_parse_into_struct() upper-cases tag and
	 * attribute names by default).
	 *
	 * @param array $vals structure returned by getOpenXmlContentTypes()
	 * @return string|false the concatenated text of all interesting parts, or
	 *                      false as soon as one part cannot be extracted
	 */
	public function getOpenXmlText($vals)
	{
		$openxml_type = $this->detectDocumentType();

		$content = '';

		foreach ($vals as $val)
		{
			if ($val['tag'] != 'OVERRIDE' || $val['type'] != 'complete')
			{
				continue;
			}

			if (!isset($val['attributes']['CONTENTTYPE'], $val['attributes']['PARTNAME']))
			{
				continue;
			}

			if (!$this->interestingParts($openxml_type, $val['attributes']['CONTENTTYPE']))
			{
				continue;
			}

			$result = $this->getContent($val['attributes']['PARTNAME']);

			if ($result === false)
			{
				return false;
			}

			$content .= $result;
		}

		return $content;
	}

	/**
	 * The main context extraction function
	 *
	 * Writes the filtered text to $this->targetfile. The temporary directory is
	 * removed on every path.
	 *
	 * @return bool true only when the text was extracted and saved
	 */

	public function extractTextContent()
	{
		$success = false;

		$xml_content = $this->getOpenXmlContentTypes();

		if ($xml_content !== false)
		{
			$content = $this->getOpenXmlText($xml_content);

			// A failed part extraction has already set $this->output; it must not
			// be reported as success because no target file is written.
			if ($content !== false)
			{
				$result = file_put_contents($this->targetfile, $this->filter($content));

				if ($result === false)
				{
					$this->output = _kt('Could not save content to file: ') . $this->targetfile;
				}
				else
				{
					$success = true;
				}
			}
		}

		KTUtil::deleteDirectory($this->openxml_dir);

		return $success;
	}

	/**
	 * Check that unzip is available
	 *
	 * @return string|null a message when the unzip command is missing, null otherwise
	 */
	public function diagnose()
	{
		// Covers false as well as an empty string.
		if (empty($this->unzip))
		{
			return sprintf(_kt("Cannot locate unzip: %s."), (string) $this->unzip);
		}
		return null;
	}

}

?>
openxmltextextractor.inc.php - 源码说明

本页面展示了「PHP 知识管理系统（基于树结构的知识管理系统）, 英文原版的PHP源码。」中的 openxmltextextractor.inc.php 源码文件，采用 PHP 编程语言编写，共 322 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与PHP相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?