📄 class.external_parser.php

📁 Typo3, 开源里边最强大的
💻 PHP
📖 第 1 页 / 共 2 页
字号:
上一页 12
	/**
	 * Returns true if the input extension (item_type) is potentially a multi-page extension.
	 * Only "pdf" is treated as multi-page here.
	 *
	 * @param	string		Extension / item_type string
	 * @return	boolean		TRUE if multi-page, otherwise FALSE
	 */
	function isMultiplePageExtension($extension)	{
			// Switch on file extension:
		switch ((string)$extension)	{
			case 'pdf':
				return TRUE;
			break;
		}

			// Explicit boolean for every other extension (previously fell through and returned NULL):
		return FALSE;
	}

	/************************
	 *
	 * Reading documents (for parsing)
	 *
	 ************************/

	/**
	 * Reads the content of an external file being indexed.
	 *
	 * External tools are run through exec(); every file name handed to the shell is passed
	 * through escapeshellarg() so that quotes, "$" or backticks in a file name cannot alter the command.
	 * NOTE(review): escapeshellarg() may drop multibyte characters depending on the LC_CTYPE locale -
	 * confirm the locale setup if files with non-ASCII names are indexed.
	 *
	 * @param	string		File extension, eg. "pdf", "doc" etc.
	 * @param	string		Absolute filename of file (must exist and be validated OK before calling function)
	 * @param	string		Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split, formatted "low-high".)
	 * @return	array		Standard content array (title, description, keywords, body keys). FALSE if the extension is not supported, NULL if no content could be extracted.
	 */
	function readFileContent($ext,$absFile,$cPKey)	{
		$contentArr = NULL;

			// Return immediately if initialization didn't set support up:
		if (empty($this->supportedExtensions[$ext]))	return FALSE;

			// The file name, quoted once for all shell commands below:
		$shellFile = escapeshellarg($absFile);

			// Switch by file extension
		switch ($ext)	{
			case 'pdf':
				if ($this->app['pdfinfo'])	{
						// Getting pdf-info:
					$res = array();
					$cmd = $this->app['pdfinfo'].' '.$shellFile;
					exec($cmd,$res);
					$pdfInfo = $this->splitPdfInfo($res);
					unset($res);

					if (isset($pdfInfo['pages']) && intval($pdfInfo['pages']))	{
							// The section pointer has the form "low-high"; force both to integers before they reach the command line:
						$range = explode('-',$cPKey);
						$low = intval($range[0]);
						$high = isset($range[1]) ? intval($range[1]) : 0;

							// Get pdf content:
						$content = '';
						$tempFileName = t3lib_div::tempnam('Typo3_indexer');		// Create temporary name
						@unlink ($tempFileName);	// Delete if exists, just to be safe.
						$cmd = $this->app['pdftotext'].' -f '.$low.' -l '.$high.' -enc UTF-8 -q '.$shellFile.' '.escapeshellarg($tempFileName);
						exec($cmd);

						if (@is_file($tempFileName))	{
							$content = t3lib_div::getUrl($tempFileName);
							unlink($tempFileName);
						} else {
							$this->pObj->log_setTSlogMessage('PDFtoText Failed on this document: '.$absFile.". Maybe the PDF file is locked for printing or encrypted.",2);
						}

						if (strlen($content))	{
							$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
						}
					}
				}
			break;
			case 'doc':
				if ($this->app['catdoc'])	{
					$res = array();
					$cmd = $this->app['catdoc'].' -d utf-8 '.$shellFile;
					exec($cmd,$res);
					$content = implode(chr(10),$res);
					unset($res);
					$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
				}
			break;
			case 'pps':
			case 'ppt':
				if ($this->app['ppthtml'])	{
					$res = array();
					$cmd = $this->app['ppthtml'].' '.$shellFile;
					exec($cmd,$res);
					$content = implode(chr(10),$res);
					unset($res);
					$content = $this->pObj->convertHTMLToUtf8($content);
					$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
					$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
				}
			break;
			case 'xls':
				if ($this->app['xlhtml'])	{
					$res = array();
					$cmd = $this->app['xlhtml'].' -nc -te '.$shellFile;
					exec($cmd,$res);
					$content = implode(chr(10),$res);
					unset($res);
					$content = $this->pObj->convertHTMLToUtf8($content);
					$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
					$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
				}
			break;
			case 'sxi':
			case 'sxc':
			case 'sxw':
			case 'ods':
			case 'odp':
			case 'odt':
				if ($this->app['unzip'])	{
						// Read content.xml:
					$res = array();
					$cmd = $this->app['unzip'].' -p '.$shellFile.' content.xml';
					exec($cmd,$res);
					$content_xml = implode(chr(10),$res);
					unset($res);

						// Read meta.xml:
					$res = array();
					$cmd = $this->app['unzip'].' -p '.$shellFile.' meta.xml';
					exec($cmd,$res);
					$meta_xml = implode(chr(10),$res);
					unset($res);

						// A space is inserted before every tag so words from adjacent elements do not get glued together when tags are stripped:
					$utf8_content = trim(strip_tags(str_replace('<',' <',$content_xml)));
					$contentArr = $this->pObj->splitRegularContent($utf8_content);
					$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!

						// Meta information (only used when the XML could be parsed into an array tree):
					$metaTree = t3lib_div::xml2tree($meta_xml);
					$metaContent = is_array($metaTree) ? $metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'] : NULL;
					if (is_array($metaContent))	{
						$contentArr['title'] = $metaContent['dc:title'][0]['values'][0] ? $metaContent['dc:title'][0]['values'][0] : $contentArr['title'];
						$contentArr['description'] = $metaContent['dc:subject'][0]['values'][0].' '.$metaContent['dc:description'][0]['values'][0];

							// Keywords collected:
						if (is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword']))	{
							foreach ($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat)	{
								$contentArr['keywords'].= $kwDat['values'][0].' ';
							}
						}
					}
				}
			break;
			case 'rtf':
				if ($this->app['unrtf'])	{
					$res = array();
					$cmd = $this->app['unrtf'].' '.$shellFile;
					exec($cmd,$res);
					$fileContent = implode(chr(10),$res);
					unset($res);
					$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
					$contentArr = $this->pObj->splitHTMLContent($fileContent);
				}
			break;
			case 'txt':
			case 'csv':		// Raw text
				$content = t3lib_div::getUrl($absFile);
					// TODO: Auto-registration of charset???? -> utf-8 (Current assuming western europe...)
				$content = $this->pObj->convertHTMLToUtf8($content, 'iso-8859-1');
				$contentArr = $this->pObj->splitRegularContent($content);
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			break;
			case 'html':
			case 'htm':
				$fileContent = t3lib_div::getUrl($absFile);
				$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
				$contentArr = $this->pObj->splitHTMLContent($fileContent);
			break;
			case 'xml':		// PHP strip-tags()
				$fileContent = t3lib_div::getUrl($absFile);

					// Finding charset from the XML declaration within the first 200 bytes (PCRE replaces eregi(), which no longer exists in PHP 7+):
				$reg = array();
				preg_match('/^[[:space:]]*<\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i',substr($fileContent,0,200),$reg);
				$charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';

					// Converting content:
				$fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<',' <',$fileContent)), $charset);
				$contentArr = $this->pObj->splitRegularContent($fileContent);
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			break;
			case 'jpg':		// PHP EXIF
			case 'jpeg':	// PHP EXIF
			case 'tif':		// PHP EXIF
				if (function_exists('exif_read_data'))	{
					$exif = exif_read_data($absFile, 'IFD0');
				} else {
					$exif = FALSE;
				}

				if ($exif)	{
					$comment = trim($exif['COMMENT'][0].' '.$exif['ImageDescription']);	// The comments in JPEG files are utf-8, while in Tif files they are 7-bit ascii.
				} else {
					$comment = '';
				}
				$contentArr = $this->pObj->splitRegularContent($comment);
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			break;
			default:
				return false;
			break;
		}

			// If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
		if (is_array($contentArr) && !$contentArr['title'])	{
			$contentArr['title'] = str_replace('_',' ',basename($absFile));	// Substituting "_" for " " because many filenames may have this instead of a space char.
		}

		return $contentArr;
	}

	/**
	 * Creates an array with pointers to divisions of document.
	 * ONLY for PDF files at this point. All other types will have an array with a single element with the value "0" (zero) coming back.
	 *
	 * $this->pdf_mode controls the division: a positive value is the number of pages per section,
	 * zero or a negative value gives abs(pdf_mode) sections (at least one, at most one per page).
	 *
	 * @param	string		File extension
	 * @param	string		Absolute filename (must exist and be validated OK before calling function)
	 * @return	array		Array of pointers to sections that the document should be divided into ("low-high" page ranges for PDF)
	 */
	function fileContentParts($ext,$absFile)	{
		$cParts = array(0);

		switch ($ext)	{
			case 'pdf':
					// Without a configured pdfinfo tool there is nothing to run; keep the default single pointer:
				if (empty($this->app['pdfinfo']))	{
					break;
				}

					// Getting pdf-info:
				$res = array();
				$cmd = $this->app['pdfinfo'].' '.escapeshellarg($absFile);
				exec($cmd,$res);
				$pdfInfo = $this->splitPdfInfo($res);
				unset($res);

				$pages = isset($pdfInfo['pages']) ? intval($pdfInfo['pages']) : 0;
				if ($pages)	{
					$cParts = array();

						// Calculate mode
					if ($this->pdf_mode>0)	{
						$iter = ceil($pages/$this->pdf_mode);
					} else {
						$iter = t3lib_div::intInRange(abs($this->pdf_mode),1,$pages);
					}

						// Traverse and create intervals.
					for ($a=0;$a<$iter;$a++)	{
						$low = floor($a*($pages/$iter))+1;
						$high = floor(($a+1)*($pages/$iter));
						$cParts[] = $low.'-'.$high;
					}
				}
			break;
		}

		return $cParts;
	}

	/**
	 * Analysing PDF info into a useable format.
	 * Each "Key: value" line becomes an entry with the lowercased, trimmed key and the trimmed value; lines without a colon or with an empty key are skipped.
	 *
	 * @param	array		Array of PDF content, coming from the pdfinfo tool
	 * @return	array		Result array
	 * @access private
	 * @see fileContentParts()
	 */
	function splitPdfInfo($pdfInfoArray)	{
		$res = array();
		if (is_array($pdfInfoArray))	{
			foreach ($pdfInfoArray as $line)	{
					// Split on the first colon only, so values containing colons stay intact:
				$parts = explode(':',$line,2);
				if (count($parts)>1 && trim($parts[0]))	{
					$res[strtolower(trim($parts[0]))] = trim($parts[1]);
				}
			}
		}
		return $res;
	}

	/**
	 * Removes some strange char(12) characters and line breaks that tend to occur in the end of the string from external files.
	 *
	 * @param	string		String to clean up
	 * @return	string		String
	 */
	function removeEndJunk($string)	{
			// PCRE replaces ereg_replace(), which no longer exists in PHP 7+:
		return trim(preg_replace('/['.chr(10).chr(12).']*$/','',$string));
	}

	/************************
	 *
	 * Backend analyzer
	 *
	 ************************/

	/**
	 * Return icon for file extension
	 *
	 * @param	string		File extension, lowercase.
	 * @return	string		Relative file reference, resolvable by t3lib_div::getFileAbsFileName()
	 */
	function getIcon($extension)	{
			// "htm" and "jpeg" share the icon of their canonical extension:
		if ($extension=='htm')	$extension = 'html';
		if ($extension=='jpeg')	$extension = 'jpg';
		return 'EXT:indexed_search/pi/res/'.$extension.'.gif';
	}
}

if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php'])	{
	include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']);
}
?>
上一页 12
💿 文件大小 8829 K
👤 上传用户 horse2000
📂 所属分类企业管理
🏷️ 相关标签

#Typo #开源
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -