📄 class.external_parser.php
字号:
/**
 * Returns true if the input extension (item_type) is potentially a multi-page extension
 *
 * @param	string		Extension / item_type string
 * @return	boolean		TRUE if multi-page, otherwise FALSE
 */
function isMultiplePageExtension($extension) {
	// Only PDF is divided into page ranges (fileContentParts() has no other case).
	return (string)$extension === 'pdf';
}

/************************
 *
 * Reading documents (for parsing)
 *
 ************************/

/**
 * Reads the content of an external file being indexed.
 *
 * External converter binaries are looked up in $this->app; a file type whose
 * converter is not configured yields NULL. All file names handed to the shell
 * are passed through escapeshellarg().
 * NOTE(review): escapeshellarg() is locale-sensitive for non-ASCII file names;
 * confirm that LC_CTYPE is set appropriately by the caller/environment.
 *
 * @param	string		File extension, eg. "pdf", "doc" etc.
 * @param	string		Absolute filename of file (must exist and be validated OK before calling function)
 * @param	string		Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split, formatted "low-high")
 * @return	mixed		Standard content array (title, description, keywords, body keys); FALSE if the extension is not supported; NULL if nothing could be extracted
 */
function readFileContent($ext,$absFile,$cPKey) {
	$contentArr = NULL;

	// Return immediately if initialization didn't set support up:
	if (empty($this->supportedExtensions[$ext])) {
		return FALSE;
	}

	// Shell-safe representation of the file name, shared by all converters below:
	$escapedFile = escapeshellarg($absFile);

	// Switch by file extension
	switch ($ext) {
		case 'pdf':
			if (!empty($this->app['pdfinfo'])) {
				// Getting pdf-info:
				$res = array();
				exec($this->app['pdfinfo'].' '.$escapedFile, $res);
				$pdfInfo = $this->splitPdfInfo($res);

				if (isset($pdfInfo['pages']) && intval($pdfInfo['pages'])) {
					// Page range; cast to integers so nothing but numbers reaches the command line:
					$range = explode('-', (string)$cPKey, 2);
					$low = intval($range[0]);
					$high = isset($range[1]) ? intval($range[1]) : 0;

					// Get pdf content:
					$tempFileName = t3lib_div::tempnam('Typo3_indexer');	// Create temporary name
					@unlink($tempFileName);	// Delete if exists, just to be safe.
					$cmd = $this->app['pdftotext'].' -f '.$low.' -l '.$high.' -enc UTF-8 -q '.$escapedFile.' '.escapeshellarg($tempFileName);
					exec($cmd);

					$content = '';
					if (@is_file($tempFileName)) {
						$content = t3lib_div::getUrl($tempFileName);
						unlink($tempFileName);
					} else {
						$this->pObj->log_setTSlogMessage('PDFtoText Failed on this document: '.$absFile.". Maybe the PDF file is locked for printing or encrypted.",2);
					}
					if (strlen($content)) {
						$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
					}
				}
			}
		break;
		case 'doc':
			if (!empty($this->app['catdoc'])) {
				$res = array();
				exec($this->app['catdoc'].' -d utf-8 '.$escapedFile, $res);
				$content = implode(chr(10),$res);
				$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
			}
		break;
		case 'pps':
		case 'ppt':
			if (!empty($this->app['ppthtml'])) {
				$res = array();
				exec($this->app['ppthtml'].' '.$escapedFile, $res);
				$content = implode(chr(10),$res);
				$content = $this->pObj->convertHTMLToUtf8($content);
				$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			}
		break;
		case 'xls':
			if (!empty($this->app['xlhtml'])) {
				$res = array();
				exec($this->app['xlhtml'].' -nc -te '.$escapedFile, $res);
				$content = implode(chr(10),$res);
				$content = $this->pObj->convertHTMLToUtf8($content);
				$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			}
		break;
		case 'sxi':
		case 'sxc':
		case 'sxw':
		case 'ods':
		case 'odp':
		case 'odt':
			if (!empty($this->app['unzip'])) {
				// Read content.xml:
				$res = array();
				exec($this->app['unzip'].' -p '.$escapedFile.' content.xml', $res);
				$content_xml = implode(chr(10),$res);

				// Read meta.xml:
				$res = array();
				exec($this->app['unzip'].' -p '.$escapedFile.' meta.xml', $res);
				$meta_xml = implode(chr(10),$res);

				// A space is inserted before each tag so that adjacent elements do not get glued together by strip_tags():
				$utf8_content = trim(strip_tags(str_replace('<',' <',$content_xml)));
				$contentArr = $this->pObj->splitRegularContent($utf8_content);
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!

				// Meta information (guarded: the parsed tree may be incomplete or not an array at all)
				$metaTree = t3lib_div::xml2tree($meta_xml);
				$metaContent = NULL;
				if (is_array($metaTree) && isset($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'])) {
					$metaContent = $metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
				}
				if (is_array($metaContent)) {
					if (!empty($metaContent['dc:title'][0]['values'][0])) {
						$contentArr['title'] = $metaContent['dc:title'][0]['values'][0];
					}
					$subject = isset($metaContent['dc:subject'][0]['values'][0]) ? $metaContent['dc:subject'][0]['values'][0] : '';
					$description = isset($metaContent['dc:description'][0]['values'][0]) ? $metaContent['dc:description'][0]['values'][0] : '';
					$contentArr['description'] = $subject.' '.$description;

					// Keywords collected:
					if (isset($metaContent['meta:keywords'][0]['ch']['meta:keyword']) && is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword'])) {
						foreach ($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
							$contentArr['keywords'].= $kwDat['values'][0].' ';
						}
					}
				}
			}
		break;
		case 'rtf':
			if (!empty($this->app['unrtf'])) {
				$res = array();
				exec($this->app['unrtf'].' '.$escapedFile, $res);
				$fileContent = implode(chr(10),$res);
				$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
				$contentArr = $this->pObj->splitHTMLContent($fileContent);
			}
		break;
		case 'txt':
		case 'csv':		// Raw text
			$content = t3lib_div::getUrl($absFile);
				// TODO: Auto-registration of charset???? -> utf-8 (Current assuming western europe...)
			$content = $this->pObj->convertHTMLToUtf8($content, 'iso-8859-1');
			$contentArr = $this->pObj->splitRegularContent($content);
			$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
		break;
		case 'html':
		case 'htm':
			$fileContent = t3lib_div::getUrl($absFile);
			$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
			$contentArr = $this->pObj->splitHTMLContent($fileContent);
		break;
		case 'xml':		// PHP strip-tags()
			$fileContent = t3lib_div::getUrl($absFile);

			// Finding charset from the XML declaration (only the first 200 bytes are inspected):
			$reg = array();
			preg_match('/^[[:space:]]*<\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent,0,200), $reg);
			$charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';

			// Converting content:
			$fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<',' <',$fileContent)), $charset);
			$contentArr = $this->pObj->splitRegularContent($fileContent);
			$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
		break;
		case 'jpg':		// PHP EXIF
		case 'jpeg':	// PHP EXIF
		case 'tif':		// PHP EXIF
			if (function_exists('exif_read_data')) {
				$exif = exif_read_data($absFile, 'IFD0');
			} else {
				$exif = FALSE;
			}

			if ($exif) {
				// The comments in JPEG files are utf-8, while in Tif files they are 7-bit ascii.
				$exifComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
				$exifDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
				$comment = trim($exifComment.' '.$exifDescription);
			} else {
				$comment = '';
			}
			$contentArr = $this->pObj->splitRegularContent($comment);
			$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
		break;
		default:
			return FALSE;
		break;
	}

	// If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
	if (is_array($contentArr) && empty($contentArr['title'])) {
		$contentArr['title'] = str_replace('_',' ',basename($absFile));	// Substituting "_" for " " because many filenames may have this instead of a space char.
	}

	return $contentArr;
}

/**
 * Creates an array with pointers to divisions of document.
 * ONLY for PDF files at this point. All other types will have an array with a single element with the value "0" (zero) coming back.
 * The same single-element array is returned for a PDF when no pdfinfo binary is configured or no page count could be read.
 *
 * @param	string		File extension
 * @param	string		Absolute filename (must exist and be validated OK before calling function)
 * @return	array		Array of pointers to sections that the document should be divided into ("low-high" page ranges for PDF)
 */
function fileContentParts($ext,$absFile) {
	$cParts = array(0);

	switch ($ext) {
		case 'pdf':
			// Without a configured pdfinfo binary there is nothing sensible to execute:
			if (empty($this->app['pdfinfo'])) {
				break;
			}

			// Getting pdf-info:
			$res = array();
			exec($this->app['pdfinfo'].' '.escapeshellarg($absFile), $res);
			$pdfInfo = $this->splitPdfInfo($res);

			$pages = isset($pdfInfo['pages']) ? intval($pdfInfo['pages']) : 0;
			if ($pages) {
				$cParts = array();

				// Calculate mode: a positive pdf_mode is used as divisor of the page count,
				// otherwise its absolute value is used as the number of parts (clamped to 1..pages).
				if ($this->pdf_mode>0) {
					$iter = ceil($pages/$this->pdf_mode);
				} else {
					$iter = t3lib_div::intInRange(abs($this->pdf_mode),1,$pages);
				}

				// Traverse and create intervals.
				for ($a=0;$a<$iter;$a++) {
					$low = floor($a*($pages/$iter))+1;
					$high = floor(($a+1)*($pages/$iter));
					$cParts[] = $low.'-'.$high;
				}
			}
		break;
	}

	return $cParts;
}

/**
 * Analysing PDF info into a useable format.
 * Each "Key: value" line becomes an entry with the lowercased, trimmed key; lines without a colon or with an empty key are skipped.
 *
 * @param	array		Array of PDF content, coming from the pdfinfo tool
 * @return	array		Result array
 * @access private
 * @see fileContentParts()
 */
function splitPdfInfo($pdfInfoArray) {
	$res = array();
	if (is_array($pdfInfoArray)) {
		foreach($pdfInfoArray as $line) {
			// Split on the first colon only, so values may contain colons themselves:
			$parts = explode(':',$line,2);
			if (count($parts)>1 && trim($parts[0])) {
				$res[strtolower(trim($parts[0]))] = trim($parts[1]);
			}
		}
	}
	return $res;
}

/**
 * Removes some strange char(12) characters and line breaks that tend to occur in the end of the string from external files.
 *
 * @param	string		String to clean up
 * @return	string		String
 */
function removeEndJunk($string) {
	// First strip the trailing run of line feeds / form feeds (form feed is not covered by plain trim()), then trim ordinary whitespace.
	return trim(rtrim($string, chr(10).chr(12)));
}

/************************
 *
 * Backend analyzer
 *
 ************************/

/**
 * Return icon for file extension
 *
 * @param	string		File extension, lowercase.
 * @return	string		Relative file reference, resolvable by t3lib_div::getFileAbsFileName()
 */
function getIcon($extension) {
	// "htm" and "jpeg" share the icon of their sibling extension:
	if ($extension=='htm')	$extension = 'html';
	if ($extension=='jpeg')	$extension = 'jpg';
	return 'EXT:indexed_search/pi/res/'.$extension.'.gif';
}
}

if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']) {
	include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']);
}
?>
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -