📄 class.indexer.php

📁 Typo3, 开源里边最强大的
💻 PHP
📖 第 1 页 / 共 5 页
字号:
	 * @param	string		The main content to index
	 * @param	string		The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
	 * @param	integer		Last modification time, in seconds
	 * @param	integer		The creation date of the content, in seconds
	 * @param	integer		The record UID that the content comes from (for registration with the indexed rows)
	 * @return	void
	 */
	function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0)	{

			// Assemble a minimal fake HTML document from the escaped pieces; it is stored as
			// the page content below and then indexed like any TYPO3 page:
		$fakeDocument  = '		<html>			<head>				<title>'.htmlspecialchars($title);
		$fakeDocument .= '</title>				<meta name="keywords" content="'.htmlspecialchars($keywords);
		$fakeDocument .= '" />				<meta name="description" content="'.htmlspecialchars($description);
		$fakeDocument .= '" />			</head>			<body>				'.htmlspecialchars($content);
		$fakeDocument .= '			</body>		</html>';

			// Timing / origin information of the content:
		$this->conf['mtime'] = $mtime;				// Most recent modification time (seconds)
		$this->conf['crdate'] = $crdate;			// Creation date (seconds)
		$this->conf['recordUid'] = $recordUid;		// UID of the originating record, if any

			// The document itself and its character set:
		$this->conf['content'] = $fakeDocument;		// Content string (HTML of the fake page)
		$this->conf['metaCharset'] = $charset;		// Charset of the content (will be converted to utf-8 during indexing)
		$this->conf['indexedDocTitle'] = '';		// No alternative title for indexing

			// Hand over to the regular TYPO3 page indexing:
		$this->indexTypo3PageContent();
	}


	/********************************
	 *
	 * Initialization
	 *
	 *******************************/

	/**
	 * Initializes the object. $this->conf MUST be set with proper values prior to this call!!!
*
	 * Copies the cHash parameters from $this->conf, sets the page hashes, loads the extension
	 * configuration and creates the lexer, the optional metaphone hook object, the external
	 * parsers (only if $this->conf['index_externals'] is set) and the charset object.
	 *
	 * @return	void
	 */
	function init()	{
		global $TYPO3_CONF_VARS;

			// Initializing:
		$this->cHashParams = $this->conf['cHash_array'];
		if (is_array($this->cHashParams) && count($this->cHashParams))	{
			if ($this->conf['cHash'])	$this->cHashParams['cHash'] = $this->conf['cHash'];	// Add this so that URL's come out right...
			unset($this->cHashParams['encryptionKey']);		// encryptionKey is added inside TSFE in order to calculate the cHash value and it should NOT be a part of this array!!! If it is it will be exposed in links!!!
		}

			// Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
		$this->setT3Hashes();

			// Indexer configuration from Extension Manager interface.
			// unserialize() returns FALSE for a missing/broken value; fall back to an empty array so the lookups below work on an array:
		$extConf = isset($TYPO3_CONF_VARS['EXT']['extConf']['indexed_search']) ? unserialize($TYPO3_CONF_VARS['EXT']['extConf']['indexed_search']) : FALSE;
		$this->indexerConfig = is_array($extConf) ? $extConf : array();
		$this->tstamp_minAge = t3lib_div::intInRange($this->indexerConfig['minAge']*3600,0);	// configured value is multiplied by 3600 (hours -> seconds)
		$this->tstamp_maxAge = t3lib_div::intInRange($this->indexerConfig['maxAge']*3600,0);
		$this->maxExternalFiles = t3lib_div::intInRange($this->indexerConfig['maxExternalFiles'],0,1000,5);
		$this->flagBitMask = t3lib_div::intInRange($this->indexerConfig['flagBitMask'],0,255);

			// Initialize external document parsers:
			// Example configuration, see ext_localconf.php of this file!
		if ($this->conf['index_externals'])	{
			$this->initializeExternalParsers();
		}

			// Initialize lexer (class that deconstructs the text into words):
			// Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
		$lexerObjRef = $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] ?
						$TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] :
						'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
		$this->lexerObj = &t3lib_div::getUserObj($lexerObjRef);
		$this->lexerObj->debug = $this->indexerConfig['debugMode'];

			// Initialize metaphone hook:
			// Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone'] = 'EXT:indexed_search/class.doublemetaphone.php:&user_DoubleMetaPhone';
		if ($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone'])	{
			$this->metaphoneObj = &t3lib_div::getUserObj($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']);
			$this->metaphoneObj->pObj = &$this;
		}

			// Init charset class:
		$this->csObj = &t3lib_div::makeInstance('t3lib_cs');
	}

	/**
	 * Initialize external parsers
	 *
	 * Creates one parser object per entry in $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers']
	 * (key = file extension, value = object reference string) and keeps only those whose
	 * initParser() call returns true.
	 *
	 * @return	void
	 * @access private
	 * @see init()
	 */
	function initializeExternalParsers()	{
		global $TYPO3_CONF_VARS;

		if (is_array($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers']))	{
			foreach($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] as $extension => $_objRef)	{
				$this->external_parsers[$extension] = &t3lib_div::getUserObj($_objRef);

					// Skip entries for which no object came back instead of failing on the property access below:
				if (!is_object($this->external_parsers[$extension]))	{
					unset($this->external_parsers[$extension]);
					continue;
				}
				$this->external_parsers[$extension]->pObj = &$this;

					// Init parser and if it returns false, unset its entry again:
				if (!$this->external_parsers[$extension]->initParser($extension))	{
					unset($this->external_parsers[$extension]);
				}
			}
		}
	}


	/********************************
	 *
	 * Indexing; TYPO3 pages (HTML content)
	 *
	 *******************************/

	/**
	 * Start indexing of the TYPO3 page
	 *
	 * Indexes $this->conf['content'] if the mtime/tstamp check asks for it, if the gr_list is not
	 * registered for the phash yet, or if $this->forceIndexing is set. If a row with the same
	 * content hash already exists, only timestamp, set id, gr_list and rootline are updated.
	 *
	 * @return	void
	 */
	function indexTypo3PageContent()	{

		$check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
		$is_grlist = $this->is_grlist_set($this->hash['phash']);

		if ($check > 0 || !$is_grlist || $this->forceIndexing)	{

				// Setting message:
			if ($this->forceIndexing)	{
				$this->log_setTSlogMessage('Indexing needed, reason: Forced',1);
			} elseif ($check > 0)	{
				$this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
			} else {
				$this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!',1);
			}

					// Divide into title,keywords,description and body:
			$this->log_push('Split content','');
				$this->contentParts = $this->splitHTMLContent($this->conf['content']);
				if ($this->conf['indexedDocTitle'])	{
					$this->contentParts['title'] = $this->conf['indexedDocTitle'];
				}
			$this->log_pull();

				// Calculating a hash over what is to be the actual page content. Maybe this hash should not include title,description and keywords? The bodytext is the primary concern. (on the other hand a changed page-title would make no difference then, so dont!)
				// Note: separator first - the legacy implode($array, $separator) order is rejected by PHP 8.
			$this->content_md5h = $this->md5inthash(implode('',$this->contentParts));

				// This function checks if there is already a page (with gr_list = 0,-1) indexed and if that page has the very same contentHash.
				// If the contentHash is the same, then we can rest assured that this page is already indexed and regardless of mtime and origContent we don't need to do anything more.
				// This will also prevent pages from being indexed if a fe_users has logged in and it turns out that the page content is not changed anyway. fe_users logged in should always search with hash_gr_list = "0,-1" OR "[their_group_list]". This situation will be prevented only if the page has been indexed with no user login on before hand. Else the page will be indexed by users until that event. However that does not present a serious problem.
			$checkCHash = $this->checkContentHash();
			if (!is_array($checkCHash) || $check===1)	{
				$Pstart=t3lib_div::milliseconds();

				$this->log_push('Converting charset of content ('.$this->conf['metaCharset'].') to utf-8','');
					$this->charsetEntity2utf8($this->contentParts,$this->conf['metaCharset']);
				$this->log_pull();

						// Splitting words
				$this->log_push('Extract words from content','');
					$splitInWords = $this->processWordsInArrays($this->contentParts);
				$this->log_pull();

						// Analyse the indexed words.
				$this->log_push('Analyse the extracted words','');
					$indexArr = $this->indexAnalyze($splitInWords);
				$this->log_pull();

						// Submitting page (phash) record
				$this->log_push('Submitting page','');
					$this->submitPage();
				$this->log_pull();

						// Check words and submit to word list if not there
				$this->log_push('Check word list and submit words','');
					$this->checkWordList($indexArr);
					$this->submitWords($indexArr,$this->hash['phash']);
				$this->log_pull();

						// Set parsetime
				$this->updateParsetime($this->hash['phash'],t3lib_div::milliseconds()-$Pstart);

						// Checking external files if configured for.
				$this->log_push('Checking external files','');
				if ($this->conf['index_externals'])	{
					$this->extractLinks($this->conf['content']);
				}
				$this->log_pull();
			} else {
				$this->updateTstamp($this->hash['phash'],$this->conf['mtime']);	// Update the timestamp
				$this->updateSetId($this->hash['phash']);
				$this->update_grlist($checkCHash['phash'],$this->hash['phash']);	// $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
				$this->updateRootline();
				$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$this->content_md5h.', has not changed. Timestamp, grlist and rootline updated if necessary.');
			}
		} else {
			$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
		}
	}

	/**
	 * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
	 *
	 * Everything before the first "<body" is treated as head; a document without a body tag is
	 * treated as head only and gets an empty body.
	 *
	 * @param	string		HTML content to index. To some degree expected to be made by TYPO3 (ie. splitting the header by ":")
	 * @return	array		Array of content, having keys "title", "body", "keywords" and "description" set.
	 * @see splitRegularContent()
	 */
	function splitHTMLContent($content) {

			// Divide head from body. Without a body tag the former substr($content,0,-0) returned an
			// empty head, which silently dropped title and metatags - use the whole document as head then:
		$contentArr = $this->defaultContentArray;
		$bodyPart = stristr($content,'<body');
		if ($bodyPart === FALSE)	{
			$headPart = $content;
			$contentArr['body'] = '';
		} else {
			$headPart = substr($content,0,strlen($content)-strlen($bodyPart));
			$contentArr['body'] = $bodyPart;
		}

			// get title (only the part after the first ":" is used, if there is one)
		$this->embracingTags($headPart,'TITLE',$contentArr['title'],$dummy2,$dummy);
		$titleParts = explode(':',$contentArr['title'],2);
		$contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);

			// get keywords and description metatags:
			// The first loop collects the attribute string of each <meta> tag, consuming $headPart as it goes.
		$meta = array();
		for($i=0;$this->embracingTags($headPart,'meta',$dummy,$headPart,$meta[$i]);$i++) { /*nothing*/ }
		for($i=0;isset($meta[$i]);$i++) {
			$meta[$i] = t3lib_div::get_tag_attributes($meta[$i]);
				// Metatags without "name" or "content" attribute must not raise undefined-index notices:
			$metaName = isset($meta[$i]['name']) ? $meta[$i]['name'] : '';
			$metaContent = isset($meta[$i]['content']) ? $meta[$i]['content'] : '';
			if(stristr($metaName,'keywords')) $contentArr['keywords'].=','.$metaContent;
			if(stristr($metaName,'description')) $contentArr['description'].=','.$metaContent;
		}

			// Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
		$this->typoSearchTags($contentArr['body']);

			// Get rid of unwanted sections (ie. scripting and style stuff) in body
		$tagList = explode(',',$this->excludeSections);
		foreach($tagList as $tag)	{
			$tag = trim($tag);
			if (!strlen($tag))	continue;	// An empty tag name would match every "<" and eat the body up to its last tag
			while($this->embracingTags($contentArr['body'],$tag,$dummy,$contentArr['body'],$dummy2));
		}

			// remove tags, but first make sure we don't concatenate words by doing it
		$contentArr['body'] = str_replace('<',' <',$contentArr['body']);
		$contentArr['body'] = trim(strip_tags($contentArr['body']));

		$contentArr['keywords'] = trim($contentArr['keywords']);
		$contentArr['description'] = trim($contentArr['description']);

			// Return array
		return $contentArr;
	}

	/**
	 * Extract the charset value from HTML meta tag.
	 *
	 * Looks for a <meta http-equiv="content-type" ...> tag (case-insensitive) and returns the
	 * value following "charset=" inside it.
	 *
	 * @param	string		HTML content
	 * @return	string		The charset value if found. Nothing (NULL) is returned otherwise.
	 */
	function getHTMLcharset($content)	{
			// preg_match() with the "i" modifier replaces eregi(), which no longer exists in PHP 7+.
			// The POSIX bracket classes used here are valid PCRE syntax too, so the patterns are unchanged.
		if (preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i',$content,$reg))	{
			if (preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i',$reg[0],$reg2))	{
				return $reg2[1];
			}
		}
	}

	/**
	 * Converts a HTML document to utf-8
	 *
	 * @param	string		HTML content, any charset
	 * @param	string		Optional charset (otherwise extracted from HTML)
	 * @return	string		Converted HTML
	 */
	function convertHTMLToUtf8($content,$charset='')	{

			// Find charset (explicit argument wins over the one declared in the document):
		$charset = $charset ? $charset : $this->getHTMLcharset($content);
		$charset = $this->csObj->parse_charset($charset);

			// Convert charset:
		if ($charset && $charset!=='utf-8')	{
			$content = $this->csObj->utf8_encode($content, $charset);
		}
			// Convert entities, assuming document is now UTF-8:
		$content = $this->csObj->entities_to_utf8($content, TRUE);

		return $content;
	}

	/**
	 * Finds first occurrence of embracing tags and returns the embraced content and the original string with
	 * the tag removed in the two passed variables. Returns false if no match found. ie. useful for finding
	 * <title> of document or removing <script>-sections
	 *
	 * @param	string		String to search in
	 * @param	string		Tag name, eg.
"script"
	 * @param	string		Passed by reference: Content inside found tag
	 * @param	string		Passed by reference: Content after found tag
	 * @param	string		Passed by reference: Attributes of the found tag.
	 * @return	boolean		Returns false if tag was not found, otherwise true.
	 */
	function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList) {
		$endTag = '</'.$tagName.'>';
		$startTag = '<'.$tagName;

		$fromStartTag = stristr($string,$startTag);		// stristr used because we want a case-insensitive search for the tag.
		if(!$fromStartTag) return false;	// if the tag was not found, return false

			// stristr() returned the rest of $string from the first match on, so the difference
			// in length is the offset of the start tag:
		$startTagPos = strlen($string)-strlen($fromStartTag);

			// Split "attributes>rest" at the first ">". If the start tag is never closed there is only
			// one element; pad it so the remainder is an empty string instead of an undefined offset:
		$parts = array_pad(explode('>',substr($fromStartTag,strlen($startTag)),2),2,'');
		$paramList = $parts[0];
		$afterStartTag = $parts[1];

		$fromEndTag = stristr($afterStartTag,$endTag);
		if ($fromEndTag)	{
				// Content is what lies between start and end tag; the "after" string is the original with the whole element cut out:
			$stringBefore = substr($string,0,$startTagPos);
			$tagContent = substr($afterStartTag,0,strlen($afterStartTag)-strlen($fromEndTag));
			$stringAfter = $stringBefore.substr($fromEndTag,strlen($endTag));
		} else {	// If there was no ending tag, the tagContent is blank and anything after the tag it self is returned.
			$tagContent='';
			$stringAfter = $afterStartTag;
		}

		return true;
	}

	/**
	 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
	 *
	 * @param	string		HTML Content, passed by reference
	 * @return	boolean		Returns true if a TYPOSEARCH_ tag was found, otherwise false.
	 */
	function typoSearchTags(&$body) {
		$expBody = preg_split('/\<\!\-\-[\s]?TYPO3SEARCH_/',$body);
💿 文件大小 8829 K
👤 上传用户 horse2000
📂 所属分类企业管理
🏷️ 相关标签

#Typo #开源
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -