📄 class.indexer.php

📁 Typo3, 开源里边最强大的
💻 PHP
📖 第 1 页 / 共 5 页
字号:
		if(count($expBody)>1) {
			$body = '';
			foreach($expBody as $val)	{
				$part = explode('-->',$val,2);
				if(trim($part[0])=='begin') {
					$body.= $part[1];
					$prev = '';
				} elseif(trim($part[0])=='end') {
					$body.= $prev;
				} else {
					$prev = $val;
				}
			}
			return true;
		} else {
			return false;
		}
	}

	/**
	 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
	 *
	 * For every hyperlink returned by extractHyperLinks():
	 * - links with a URL scheme are passed to indexExternalUrl() if $this->indexerConfig['indexExternalURLs'] is set,
	 * - links without scheme and without query string are resolved to a local file which is either
	 *   queued in the "crawler" extension (when enabled and loaded) or indexed directly with indexRegularDocument(),
	 * - all other links are ignored.
	 *
	 * @param	string		HTML content
	 * @return	void
	 */
	function extractLinks($content) {

			// Get links:
		$list = $this->extractHyperLinks($content);

			// The crawler object only exists when the crawler is configured AND loaded.
			// It is initialised explicitly so that the is_object() test below never reads an undefined variable.
		$crawler = NULL;
		if (!empty($this->indexerConfig['useCrawlerForExternalFiles']) && t3lib_extMgm::isLoaded('crawler'))	{
			$this->includeCrawlerClass();
			$crawler = t3lib_div::makeInstance('tx_crawler_lib');
		}

			// Traverse links:
		foreach($list as $linkInfo)	{

				// Decode entities:
				// localPath means: This file is sent by a download script. While the indexed URL has to point to $linkInfo['href'], the absolute path to the file is specified here!
			$hasLocalPath = !empty($linkInfo['localPath']);
			$linkSource = t3lib_div::htmlspecialchars_decode($hasLocalPath ? $linkInfo['localPath'] : $linkInfo['href']);

				// Parse URL (parse_url() returns FALSE for seriously malformed URLs; treat that like "no parts"):
			$qParts = parse_url($linkSource);
			if (!is_array($qParts))	{
				$qParts = array();
			}

				// Check for jumpurl (TYPO3 specific thing...)
			if (!empty($qParts['query']) && strstr($qParts['query'],'jumpurl='))	{
				parse_str($qParts['query'],$getP);
					// The substring test above also matches e.g. "xjumpurl=", so the key may be missing:
				$linkSource = isset($getP['jumpurl']) ? $getP['jumpurl'] : '';
				$qParts = parse_url($linkSource);	// parse again due to new linkSource!
				if (!is_array($qParts))	{
					$qParts = array();
				}
			}

				// External URL (http or otherwise):
			if (!empty($qParts['scheme']))	{
				if (!empty($this->indexerConfig['indexExternalURLs']))	{
					$this->indexExternalUrl($linkSource);
				}
				continue;
			}

				// Local links carrying a query string are not treated as files:
			if (!empty($qParts['query']))	{
				continue;
			}

				// Resolve to an absolute local file name:
			if (t3lib_div::isAllowedAbsPath($linkSource))	{
				$localFile = $linkSource;
			} else {
				$localFile = t3lib_div::getFileAbsFileName(PATH_site.$linkSource);
			}
			if (!$localFile || !@is_file($localFile))	{
				continue;
			}

				// Index local file:
			if (is_object($crawler))	{
					// Hand the file over to the crawler queue. "alturl" is only passed for download-script files.
				$params = array('document' => $linkSource);
				if ($hasLocalPath)	{
					$params['alturl'] = $linkInfo['href'];
				}
				$params['conf'] = $this->conf;
				unset($params['conf']['content']);

				$crawler->addQueueEntry_callBack(0,$params,'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_files',$this->conf['id']);
				$this->log_setTSlogMessage('media "'.$params['document'].'" added to "crawler" queue.',1);
			} elseif ($hasLocalPath)	{
					// The href is what gets indexed as URL, the content is read from the local path:
				$fI = pathinfo($linkSource);
				$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
				$this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
			} else {
				$this->indexRegularDocument($linkSource);
			}
		}
	}

	/**
	 * Extracts all links to external documents from content string.
*
	 * Each returned entry is an array with the keys:
	 * - "tag": the raw tag string the link was found in
	 * - "href": the value of the href attribute
	 * - "localPath": path registered for this href in $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'] (empty string if none is registered, FALSE if the registry is not an array)
	 *
	 * @param	string		Content to analyse
	 * @return	array		Array of hyperlinks
	 * @see extractLinks()
	 */
	function extractHyperLinks($string)	{
		if (!is_object($this->htmlParser))	{
			$this->htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
		}

		$parts = $this->htmlParser->splitTags('a',$string);
		$list = array();

		foreach ($parts as $k => $v)	{
				// Only the odd-indexed parts are inspected (presumably those are the tags, the even ones the content in between - verify against t3lib_parseHtml::splitTags()).
			if (!($k%2))	{
				continue;
			}

			$params = $this->htmlParser->get_tag_attributes($v,1);
			$firstTagName = $this->htmlParser->getFirstTagName($v);	// The 'name' of the first tag
			if (strtolower($firstTagName) !== 'a')	{
				continue;
			}

				// Anchors without href (eg. <a name="...">) carry no link:
			$src = isset($params[0]['href']) ? $params[0]['href'] : '';
			if (!$src)	{
				continue;
			}

				// Check if a local path to that file has been set - useful if you are using a download script.
			$indexLocalFiles = isset($GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles']) ? $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'] : NULL;
			if (is_array($indexLocalFiles))	{
				$md5 = t3lib_div::shortMD5($src);
				$localPath = isset($indexLocalFiles[$md5]) ? $indexLocalFiles[$md5] : '';
			} else {
				$localPath = false;
			}

			$list[] = array(
				'tag' => $v,
				'href' => $src,
				'localPath' => $localPath
			);
		}

		return $list;
	}

	/******************************************
	 *
	 * Indexing; external URL
	 *
	 ******************************************/

	/**
	 * Index External URLs HTML content
	 *
	 * The URL is only fetched and indexed if its response headers announce a "text/html" content type.
	 * The fetched content is also kept in $this->indexExternalUrl_content.
	 *
	 * @param	string		URL, eg. "http://typo3.org/"
	 * @return	void
	 * @see indexRegularDocument()
	 */
	function indexExternalUrl($externalUrl)	{

			// Get headers:
		$urlHeaders = $this->getUrlHeaders($externalUrl);
		if (!is_array($urlHeaders))	{
			return;	// No answer from the URL
		}

			// HTTP header names are case-insensitive, so look the content type up on lowercased keys:
		$lcHeaders = array_change_key_case($urlHeaders, CASE_LOWER);
		if (!isset($lcHeaders['content-type']) || !stristr($lcHeaders['content-type'],'text/html'))	{
			return;
		}

		$content = $this->indexExternalUrl_content = t3lib_div::getUrl($externalUrl);
		if (!strlen($content))	{
			return;
		}

			// Create temporary file:
			// t3lib_div::tempnam() is expected to wrap PHP's tempnam(), which already creates an empty placeholder file.
			// The content is written to a sibling with ".html" appended, so BOTH files have to be removed afterwards.
		$tmpBase = t3lib_div::tempnam('EXTERNAL_URL');
		$tmpFile = $tmpBase.'.html';
		t3lib_div::writeFile($tmpFile, $content);

			// Index that file:
		$this->indexRegularDocument($externalUrl, TRUE, $tmpFile, 'html');	// Using "TRUE" for second parameter to force indexing of external URLs (mtime doesn't make sense, does it?)

			// Clean up:
		if (@is_file($tmpFile))	{
			unlink($tmpFile);
		}
		if ($tmpBase && @is_file($tmpBase))	{
			unlink($tmpBase);
		}
	}

	/**
	 * Getting HTTP request headers of URL
	 *
	 * @param	string		The URL
	 * @return	mixed		If no answer, returns false. Otherwise an array where HTTP headers are keys
	 */
	function getUrlHeaders($url)	{
		$content = t3lib_div::getURL($url,2);	// Try to get the headers only
		if (!strlen($content))	{
			return FALSE;
		}

			// Compile headers:
			// NOTE(review): the third argument to trimExplode() presumably drops empty lines already, in which case the empty-line check below never fires - confirm.
		$headers = t3lib_div::trimExplode(chr(10),$content,1);
		$retVal = array();
		foreach($headers as $line)	{
			if (!strlen(trim($line)))	{
				break;	// Stop at the first empty line (= end of header)
			}

				// Lines without a colon (eg. the status line "HTTP/1.1 200 OK") are kept as key with an empty value:
			$pair = explode(':', $line, 2);
			$retVal[trim($pair[0])] = isset($pair[1]) ? trim($pair[1]) : '';
		}

		return $retVal;
	}

	/******************************************
	 *
	 * Indexing; external files (PDF, DOC, etc)
	 *
	 ******************************************/

	/**
	 * Indexing a regular document given as $file (relative to PATH_site, local file)
	 *
	 * @param	string		Relative Filename, relative to PATH_site. It can also be an absolute path as long as it is inside the lockRootPath (validated with t3lib_div::isAbsPath()).
Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
	 * @param	boolean		If set, indexing is forced (despite content hashes, mtime etc).
	 * @param	string		Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
	 * @param	string		File extension for temporary file.
	 * @return	void
	 */
	function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='')	{

			// Init
		$fI = pathinfo($file);
		if ($altExtension)	{
			$ext = $altExtension;
		} else {
				// pathinfo() only sets "extension" when the name actually has one:
			$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
		}

			// Create abs-path:
		if ($contentTmpFile)	{
			$absFile = $contentTmpFile;
		} else {
			if (t3lib_div::isAbsPath($file))	{	// Absolute, pass-through:
				$absFile = $file;
			} else {	// Relative, prepend PATH_site:
				$absFile = t3lib_div::getFileAbsFileName(PATH_site.$file);
			}
			$absFile = t3lib_div::isAllowedAbsPath($absFile) ? $absFile : '';
		}

			// Bail out early if there is nothing that can be indexed:
		if (!$absFile || !@is_file($absFile))	{
			$this->log_setTSlogMessage('Indexing not possible; File "'.$absFile.'" not found or valid.');
			return;
		}
		if (empty($this->external_parsers[$ext]))	{
			$this->log_setTSlogMessage('Indexing not possible; The extension "'.$ext.'" was not supported.');
			return;
		}

			// Indexing the document, one pass per content part:
		$mtime = filemtime($absFile);
		$cParts = $this->fileContentParts($ext,$absFile);

		foreach($cParts as $cPKey)	{
			$this->internal_log = array();
			$this->log_push('Index: '.str_replace('.','_',basename($file)).($cPKey?'#'.$cPKey:''),'');
			$Pstart = t3lib_div::milliseconds();

			$subinfo = array('key' => $cPKey);	// Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
			$phash_arr = $this->file_phash_arr = $this->setExtHashes($file,$subinfo);
			$check = $this->checkMtimeTstamp($mtime, $phash_arr['phash']);

			if ($check > 0 || $force)	{
				if ($check > 0)	{
					$this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
				} else {
					$this->log_setTSlogMessage('Indexing forced by flag',1);
				}

					// Check external file counter:
				if ($this->externalFileCounter < $this->maxExternalFiles || $force)	{

						// Divide into title,keywords,description and body:
					$this->log_push('Split content','');
						$contentParts = $this->readFileContent($ext,$absFile,$cPKey);
					$this->log_pull();

					if (is_array($contentParts))	{
							// Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
							// Separator first: the legacy implode(array, separator) argument order is rejected by PHP 8.
						$content_md5h = $this->md5inthash(implode('',$contentParts));

						if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force)	{

								// Increment counter:
							$this->externalFileCounter++;

								// Splitting words
							$this->log_push('Extract words from content','');
								$splitInWords = $this->processWordsInArrays($contentParts);
							$this->log_pull();

								// Analyse the indexed words.
							$this->log_push('Analyse the extracted words','');
								$indexArr = $this->indexAnalyze($splitInWords);
							$this->log_pull();

								// Submitting page (phash) record
							$this->log_push('Submitting page','');
								$size = filesize($absFile);
								$ctime = filemtime($absFile);	// Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
								$this->submitFilePage($phash_arr,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts);
							$this->log_pull();

								// Check words and submit to word list if not there
							$this->log_push('Check word list and submit words','');
								$this->checkWordList($indexArr);
								$this->submitWords($indexArr,$phash_arr['phash']);
							$this->log_pull();

								// Set parsetime
							$this->updateParsetime($phash_arr['phash'],t3lib_div::milliseconds()-$Pstart);
						} else {
							$this->updateTstamp($phash_arr['phash'],$mtime);	// Update the timestamp
							$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$content_md5h.', has not changed. Timestamp updated.');
						}
					} else {
						$this->log_setTSlogMessage('Could not index file! Unsupported extension.');
					}
				} else {
					$this->log_setTSlogMessage('The limit of '.$this->maxExternalFiles.' has already been exceeded, so no indexing will take place this time.');
				}
			} else {
				$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
			}

				// Checking and setting sections:
	#		$this->submitFile_grlist($phash_arr['phash']);	// Setting a gr_list record if there is none already (set for default fe_group)
			$this->submitFile_section($phash_arr['phash']);		// Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.

				// Closes the 'Index: ...' log entry pushed at the top of this iteration:
			$this->log_pull();
		}
	}

	/**
	 * Reads the content of an external file being indexed.
	 * The content from the external parser MUST be returned in utf-8!
	 *
	 * @param	string		File extension, eg. "pdf", "doc" etc.
* @param	string		Absolute filename of file (must exist and be validated OK before calling function)	 * @param	string		Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)	 * @return	array		Standard content array (title, description, keywords, body keys)	 */	function readFileContent($ext,$absFile,$cPKey)	{			// Consult relevant external document parser:
💿 文件大小 8829 K
👤 上传用户 horse2000
📂 所属分类企业管理
🏷️ 相关标签

#Typo #开源
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -