📄 class.indexer.php
字号:
if(count($expBody)>1) { $body = ''; foreach($expBody as $val) { $part = explode('-->',$val,2); if(trim($part[0])=='begin') { $body.= $part[1]; $prev = ''; } elseif(trim($part[0])=='end') { $body.= $prev; } else { $prev = $val; } } return true; } else { return false; } }

	/**
	 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
	 *
	 * Links with a URL scheme are handed to indexExternalUrl() (only when
	 * $this->indexerConfig['indexExternalURLs'] is set). Scheme-less links without a
	 * query string are resolved to a local file and either queued in the "crawler"
	 * extension or indexed directly through indexRegularDocument().
	 *
	 * @param	string		HTML content
	 * @return	void
	 */
	function extractLinks($content) {
			// Get links:
		$list = $this->extractHyperLinks($content);

			// Always define $crawler so that the is_object() test below never reads an undefined variable:
		$crawler = NULL;
		if ($this->indexerConfig['useCrawlerForExternalFiles'] && t3lib_extMgm::isLoaded('crawler')) {
			$this->includeCrawlerClass();
			$crawler = t3lib_div::makeInstance('tx_crawler_lib');
		}

			// Traverse links:
		foreach ($list as $linkInfo) {

				// Decode entities:
			if ($linkInfo['localPath']) {
					// localPath means: This file is sent by a download script. While the indexed URL has to point to $linkInfo['href'], the absolute path to the file is specified here!
				$linkSource = t3lib_div::htmlspecialchars_decode($linkInfo['localPath']);
			} else {
				$linkSource = t3lib_div::htmlspecialchars_decode($linkInfo['href']);
			}

				// Parse URL (parse_url() omits components that are absent, hence the !empty() tests below):
			$qParts = parse_url($linkSource);

				// Check for jumpurl (TYPO3 specific thing...)
			if (!empty($qParts['query']) && strstr($qParts['query'],'jumpurl=')) {
				parse_str($qParts['query'],$getP);
				$linkSource = isset($getP['jumpurl']) ? $getP['jumpurl'] : '';
				$qParts = parse_url($linkSource);	// parse again due to new linkSource!
			}

			if (!empty($qParts['scheme'])) {
				if ($this->indexerConfig['indexExternalURLs']) {
						// Index external URL (http or otherwise)
					$this->indexExternalUrl($linkSource);
				}
			} elseif (empty($qParts['query'])) {
				if (t3lib_div::isAllowedAbsPath($linkSource)) {
					$localFile = $linkSource;
				} else {
					$localFile = t3lib_div::getFileAbsFileName(PATH_site.$linkSource);
				}

				if ($localFile && @is_file($localFile)) {
						// Index local file:
					if (is_object($crawler)) {
							// Key order (document, alturl, conf) is kept as before in case the receiver depends on it.
						$params = array('document' => $linkSource);
						if ($linkInfo['localPath']) {
							$params['alturl'] = $linkInfo['href'];
						}
						$params['conf'] = $this->conf;
						unset($params['conf']['content']);

						$crawler->addQueueEntry_callBack(0,$params,'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_files',$this->conf['id']);
						$this->log_setTSlogMessage('media "'.$params['document'].'" added to "crawler" queue.',1);
					} elseif ($linkInfo['localPath']) {
							// The href is what gets indexed as the document name; the content is read from the local path.
						$fI = pathinfo($linkSource);
						$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
						$this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
					} else {
						$this->indexRegularDocument($linkSource);
					}
				}
			}
		}
	}

	/**
	 * Extracts all links to external documents from content string.
	 *
	 * Each returned entry is an array with the keys 'tag' (the raw tag), 'href' and
	 * 'localPath'. 'localPath' is FALSE when no local-file map is registered in
	 * $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'], an empty string
	 * when the map has no entry for the href, and the mapped path otherwise.
	 *
	 * @param	string		Content to analyse
	 * @return	array		Array of hyperlinks
	 * @see extractLinks()
	 */
	function extractHyperLinks($string) {
		if (!is_object($this->htmlParser)) {
			$this->htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
		}

			// The local-file map does not change while traversing the tags, so it is fetched once:
		$indexLocalFiles = isset($GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'])
			? $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles']
			: NULL;
		$hasLocalFileMap = is_array($indexLocalFiles);

		$parts = $this->htmlParser->splitTags('a',$string);
		$list = array();
		foreach ($parts as $k => $v) {
				// Only odd-indexed parts are inspected as tags:
			if (!($k%2)) {
				continue;
			}

			$params = $this->htmlParser->get_tag_attributes($v,1);
			$firstTagName = $this->htmlParser->getFirstTagName($v);	// The 'name' of the first tag
			if (strtolower($firstTagName) != 'a') {
				continue;
			}

			$src = isset($params[0]['href']) ? $params[0]['href'] : '';
			if (!$src) {
				continue;
			}

				// Check if a local path to that file has been set - useful if you are using a download script.
			if ($hasLocalFileMap) {
				$md5 = t3lib_div::shortMD5($src);
				$localPath = isset($indexLocalFiles[$md5]) ? $indexLocalFiles[$md5] : '';
			} else {
				$localPath = false;
			}

			$list[] = array(
				'tag' => $v,
				'href' => $src,
				'localPath' => $localPath
			);
		}

		return $list;
	}

	/******************************************
	 *
	 * Indexing; external URL
	 *
	 ******************************************/

	/**
	 * Index External URLs HTML content
	 *
	 * The URL is only fetched and indexed when its response headers announce a
	 * "text/html" content type. The fetched content is also stored in
	 * $this->indexExternalUrl_content.
	 *
	 * @param	string		URL, eg. "http://typo3.org/"
	 * @return	void
	 * @see indexRegularDocument()
	 */
	function indexExternalUrl($externalUrl) {
			// Get headers:
		$urlHeaders = $this->getUrlHeaders($externalUrl);

		$contentType = '';
		if (is_array($urlHeaders)) {
			if (isset($urlHeaders['Content-Type'])) {
				$contentType = $urlHeaders['Content-Type'];
			} else {
					// HTTP header field names are case-insensitive, so accept e.g. "content-type" as well:
				foreach ($urlHeaders as $headKey => $headValue) {
					if (!strcasecmp(trim($headKey), 'Content-Type')) {
						$contentType = $headValue;
						break;
					}
				}
			}
		}

		if (stristr((string)$contentType,'text/html')) {
			$content = $this->indexExternalUrl_content = t3lib_div::getUrl($externalUrl);
			if (strlen((string)$content)) {
					// Create temporary file:
					// NOTE(review): PHP's tempnam() creates the file it names; assuming t3lib_div::tempnam() behaves the same,
					// the base file must be removed in addition to the ".html" file that is actually written.
				$tmpBase = t3lib_div::tempnam('EXTERNAL_URL');
				$tmpFile = $tmpBase.'.html';
				t3lib_div::writeFile($tmpFile, $content);

					// Index that file:
				$this->indexRegularDocument($externalUrl, TRUE, $tmpFile, 'html');	// Using "TRUE" for second parameter to force indexing of external URLs (mtime doesn't make sense, does it?)

					// Clean up; the is_file() guards avoid warnings if writing failed:
				if (@is_file($tmpFile)) {
					unlink($tmpFile);
				}
				if ($tmpBase && @is_file($tmpBase)) {
					unlink($tmpBase);
				}
			}
		}
	}

	/**
	 * Getting HTTP response headers of URL
	 *
	 * Every header line is split at its first colon; the part before it becomes the
	 * array key (case preserved), the part after it the value (not trimmed). A line
	 * without a colon (such as the HTTP status line) is stored with a NULL value.
	 *
	 * @param	string		The URL
	 * @return	mixed		If no answer, returns false. Otherwise an array where HTTP headers are keys
	 */
	function getUrlHeaders($url) {
		$content = t3lib_div::getURL($url,2);	// Try to get the headers only

		if (!strlen((string)$content)) {
				// Explicit FALSE, as documented (previously the function fell through without a return value).
			return false;
		}

			// Compile headers:
		$headers = t3lib_div::trimExplode(chr(10),$content,1);
		$retVal = array();
		foreach ($headers as $line) {
			if (!strlen(trim($line))) {
				break;	// Stop at the first empty line (= end of header)
			}

			$lineParts = explode(':', $line, 2);
			$retVal[$lineParts[0]] = isset($lineParts[1]) ? $lineParts[1] : NULL;
		}

		return $retVal;
	}

	/******************************************
	 *
	 * Indexing; external files (PDF, DOC, etc)
	 *
	 ******************************************/

	/**
	 * Indexing a regular document given as $file (relative to PATH_site, local file)
	 *
	 * @param	string		Relative Filename, relative to PATH_site. It can also be an absolute path as long as it passes t3lib_div::isAllowedAbsPath(). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
	 * @param	boolean		If set, indexing is forced (despite content hashes, mtime etc).
	 * @param	string		Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
	 * @param	string		File extension for temporary file.
	 * @return	void
	 */
	function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='') {

			// Init
		$fI = pathinfo($file);
		if ($altExtension) {
			$ext = $altExtension;
		} else {
				// pathinfo() only sets 'extension' when the name has one.
			$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
		}

			// Create abs-path:
		if (!$contentTmpFile) {
			if (!t3lib_div::isAbsPath($file)) {
					// Relative, prepend PATH_site:
				$absFile = t3lib_div::getFileAbsFileName(PATH_site.$file);
			} else {
					// Absolute, pass-through:
				$absFile = $file;
			}
			$absFile = t3lib_div::isAllowedAbsPath($absFile) ? $absFile : '';
		} else {
			$absFile = $contentTmpFile;
		}

			// Indexing the document:
		if ($absFile && @is_file($absFile)) {
			if (!empty($this->external_parsers[$ext])) {
				$mtime = filemtime($absFile);
				$cParts = $this->fileContentParts($ext,$absFile);

				foreach ($cParts as $cPKey) {
					$this->internal_log = array();
					$this->log_push('Index: '.str_replace('.','_',basename($file)).($cPKey?'#'.$cPKey:''),'');
					$Pstart = t3lib_div::milliseconds();
					$subinfo = array('key' => $cPKey);	// Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
					$phash_arr = $this->file_phash_arr = $this->setExtHashes($file,$subinfo);
					$check = $this->checkMtimeTstamp($mtime, $phash_arr['phash']);

					if ($check > 0 || $force) {
						if ($check > 0) {
							$this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
						} else {
							$this->log_setTSlogMessage('Indexing forced by flag',1);
						}

							// Check external file counter:
						if ($this->externalFileCounter < $this->maxExternalFiles || $force) {

								// Divide into title,keywords,description and body:
							$this->log_push('Split content','');
							$contentParts = $this->readFileContent($ext,$absFile,$cPKey);
							$this->log_pull();

							if (is_array($contentParts)) {
									// Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
									// Separator first: the (array, separator) argument order is no longer accepted by PHP 8.
								$content_md5h = $this->md5inthash(implode('', $contentParts));

								if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {

										// Increment counter:
									$this->externalFileCounter++;

										// Splitting words
									$this->log_push('Extract words from content','');
									$splitInWords = $this->processWordsInArrays($contentParts);
									$this->log_pull();

										// Analyse the indexed words.
									$this->log_push('Analyse the extracted words','');
									$indexArr = $this->indexAnalyze($splitInWords);
									$this->log_pull();

										// Submitting page (phash) record
									$this->log_push('Submitting page','');
									$size = filesize($absFile);
									$ctime = filemtime($absFile);	// Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
									$this->submitFilePage($phash_arr,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts);
									$this->log_pull();

										// Check words and submit to word list if not there
									$this->log_push('Check word list and submit words','');
									$this->checkWordList($indexArr);
									$this->submitWords($indexArr,$phash_arr['phash']);
									$this->log_pull();

										// Set parsetime
									$this->updateParsetime($phash_arr['phash'],t3lib_div::milliseconds()-$Pstart);
								} else {
									$this->updateTstamp($phash_arr['phash'],$mtime);	// Update the timestamp
									$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$content_md5h.', has not changed. Timestamp updated.');
								}
							} else {
								$this->log_setTSlogMessage('Could not index file! Unsupported extension.');
							}
						} else {
							$this->log_setTSlogMessage('The limit of '.$this->maxExternalFiles.' has already been exceeded, so no indexing will take place this time.');
						}
					} else {
						$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
					}

						// Checking and setting sections:
					#	$this->submitFile_grlist($phash_arr['phash']);	// Setting a gr_list record if there is none already (set for default fe_group)
					$this->submitFile_section($phash_arr['phash']);		// Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
					$this->log_pull();
				}
			} else {
				$this->log_setTSlogMessage('Indexing not possible; The extension "'.$ext.'" was not supported.');
			}
		} else {
			$this->log_setTSlogMessage('Indexing not possible; File "'.$absFile.'" not found or valid.');
		}
	}

	/**
	 * Reads the content of an external file being indexed.
	 * The content from the external parser MUST be returned in utf-8!
	 *
	 * @param	string		File extension, eg. "pdf", "doc" etc.
	 * @param	string		Absolute filename of file (must exist and be validated OK before calling function)
	 * @param	string		Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)
	 * @return	array		Standard content array (title, description, keywords, body keys)
	 */
	function readFileContent($ext,$absFile,$cPKey) {
			// Consult relevant external document parser:
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -