📄 class.indexer.php

📁 Typo3, 开源里边最强大的
💻 PHP
📖 第 1 页 / 共 5 页
字号:
		if (is_object($this->external_parsers[$ext]))	{
			$contentArr = $this->external_parsers[$ext]->readFileContent($ext,$absFile,$cPKey);
		}

		return $contentArr;
	}

	/**
	 * Creates an array with pointers to divisions of document.
	 * Falls back to array(0) when no parser object is registered for the extension.
	 *
	 * @param	string		File extension
	 * @param	string		Absolute filename (must exist and be validated OK before calling function)
	 * @return	array		Array of pointers to sections that the document should be divided into
	 */
	function fileContentParts($ext,$absFile)	{
		$cParts = array(0);

			// Consult relevant external document parser:
		if (is_object($this->external_parsers[$ext]))	{
			$cParts = $this->external_parsers[$ext]->fileContentParts($ext,$absFile);
		}

		return $cParts;
	}

	/**
	 * Splits non-HTML content (from external files for instance)
	 * Starts from $this->defaultContentArray and stores the input under the "body" key.
	 *
	 * @param	string		Input content (non-HTML) to index.
	 * @return	array		Array of content, having the key "body" set (plus "title", "description" and "keywords", but empty)
	 * @see splitHTMLContent()
	 */
	function splitRegularContent($content) {
		$contentArr = $this->defaultContentArray;
		$contentArr['body'] = $content;

		return $contentArr;
	}

	/**********************************
	 *
	 * Analysing content, Extracting words
	 *
	 **********************************/

	/**
	 * Convert character set and HTML entities in the value of input content array keys
	 * Empty values are skipped; all other values are rewritten in place.
	 *
	 * @param	array		Standard content array (passed by reference and modified)
	 * @param	string		Charset of the input content (converted to utf-8)
	 * @return	void
	 */
	function charsetEntity2utf8(&$contentArr, $charset)	{

			// Walk over a snapshot of the keys, because the values are rewritten inside the loop.
			// (foreach replaces the former each() loop; each() is deprecated since PHP 7.2 and removed in PHP 8.0.)
		foreach (array_keys($contentArr) as $key)	{
			if (strlen($contentArr[$key]))	{

					// Convert charset if necessary
				if ($charset!=='utf-8')	{
					$contentArr[$key] = $this->csObj->utf8_encode($contentArr[$key], $charset);
				}

					// decode all numeric / html-entities in the string to real characters:
				$contentArr[$key] = $this->csObj->entities_to_utf8($contentArr[$key],TRUE);
			}
		}
	}

	/**
	 * Processing words in the array from split*Content -functions
	 *
	 * @param	
array		Array of content to index, see splitHTMLContent() and splitRegularContent()
	 * @return	array		Content input array modified so each key is now an array of words (made unique for title, keywords and description)
	 */
	function processWordsInArrays($contentArr)	{

			// split all parts to words
			// (foreach replaces the former each() loop; each() is deprecated since PHP 7.2 and removed in PHP 8.0.)
		foreach (array_keys($contentArr) as $key)	{
			$contentArr[$key] = $this->lexerObj->split2Words($contentArr[$key]);
		}

			// For title, keywords, and description we don't want duplicates:
		$contentArr['title'] = array_unique($contentArr['title']);
		$contentArr['keywords'] = array_unique($contentArr['keywords']);
		$contentArr['description'] = array_unique($contentArr['description']);

			// Return modified array:
		return $contentArr;
	}

	/**
	 * Processing words in the array from split*Content -functions
	 * This function is only a wrapper kept under the old, misspelled name; it delegates to processWordsInArrays().
	 *
	 * @param	array		Array of content to index, see splitHTMLContent() and splitRegularContent()
	 * @return	array		Whatever processWordsInArrays() returns for the same input
	 * @deprecated
	 */
	function procesWordsInArrays($contentArr)	{
		return $this->processWordsInArrays($contentArr);
	}

	/**
	 * Extracts the sample description text from the content array.
	 * The maximum length is read from $this->conf['index_descrLgd'] and passed through
	 * t3lib_div::intInRange() with the arguments 0, 255, 200.
	 *
	 * @param	array		Content array (the "body" key is used)
	 * @return	string		Description string (empty string when the resulting maximum length is 0)
	 */
	function bodyDescription($contentArr)	{

			// Defined up front so that a maximum length of 0 returns an empty string instead of an undefined variable.
		$bodyDescription = '';

			// Setting description
		$maxL = t3lib_div::intInRange($this->conf['index_descrLgd'],0,255,200);
		if ($maxL)	{

				// Replace space, tab, CR and LF characters with a plain space:
			$bodyDescription = str_replace(array(' ',"\t","\r","\n"),' ',$contentArr['body']);

				// Shorten the string:
			$bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
		}

		return $bodyDescription;
	}

	/**
	 * Analyzes content to use for indexing,
	 * Header words are processed first (title, keywords, description), then the body words.
	 *
	 * @param	array		Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
	 * @return	array		Index array keyed by word; see analyzeHeaderinfo() and analyzeBody() for the per-word fields
	 */
	function indexAnalyze($content) {
		$indexArr = Array();

		$this->analyzeHeaderinfo($indexArr,$content,'title',7);
		$this->analyzeHeaderinfo($indexArr,$content,'keywords',6);
		$this->analyzeHeaderinfo($indexArr,$content,'description',5);
		$this->analyzeBody($indexArr,$content);

		return ($indexArr);
	}

	/**
	 * Calculates relevant information for headercontent
	 * For every word the fields "cmp" (bit mask), "count", "hash" and "metaphone" are written and
	 * $this->wordcount is incremented.
	 *
	 * @param	array		Index array, passed by reference
	 * @param	array		Standard content array
	 * @param	string		Key from standard content array
	 * @param	integer		Bit-wise priority to type (bit number that is set in "cmp")
	 * @return	void
	 */
	function analyzeHeaderinfo(&$retArr,$content,$key,$offset) {
			// (foreach replaces the former each() loop; each() is deprecated since PHP 7.2 and removed in PHP 8.0.)
		foreach ($content[$key] as $val)	{
			$val = substr($val,0,60);	// Max 60 - because the baseword varchar IS 60. This MUST be the same.

				// Start the accumulators at 0 for a word not seen before, instead of reading unset array keys:
			$cmp = isset($retArr[$val]['cmp']) ? $retArr[$val]['cmp'] : 0;
			$count = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;

			$retArr[$val]['cmp'] = $cmp|pow(2,$offset);
			$retArr[$val]['count'] = $count+1;
			$retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
			$retArr[$val]['metaphone'] = $this->metaphone($val);
			$this->wordcount++;
		}
	}

	/**
	 * Calculates relevant information for bodycontent
	 * A word not yet present in the index array gets "first" (its key in the body array), "hash" and
	 * "metaphone"; "count" is incremented for every occurrence and $this->wordcount is incremented.
	 *
	 * @param	array		Index array, passed by reference
	 * @param	array		Standard content array
	 * @return	void
	 */
	function analyzeBody(&$retArr,$content) {
		foreach($content['body'] as $key => $val)	{
			$val = substr($val,0,60);	// Max 60 - because the baseword varchar IS 60. This MUST be the same.

			if(!isset($retArr[$val])) {
				$retArr[$val]['first'] = $key;
				$retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
				$retArr[$val]['metaphone'] = $this->metaphone($val);
			}

				// A word first seen here has no "count" yet; start from 0 instead of reading an unset array key:
			$count = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
			$retArr[$val]['count'] = $count+1;
			$this->wordcount++;
		}
	}

	/**
	 * Creating metaphone based hash from input word
	 * Uses $this->metaphoneObj when it is an object, otherwise PHP's native metaphone().
	 *
	 * @param	string		Word to convert
	 * @param	boolean		If set, returns the raw metaphone value (not hashed)
	 * @return	mixed		Metaphone hash integer (or raw value, string); 0 when the metaphone value compares equal to ''
	 */
	function metaphone($word,$retRaw=FALSE) {

		if (is_object($this->metaphoneObj))	{
			$tmp = $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid']);
		} else {
			$tmp = metaphone($word);
		}

			// Return raw value?
		if ($retRaw)	return $tmp;

			// Otherwise create hash and return integer
		if($tmp=='') $ret=0; else $ret=hexdec(substr(md5($tmp),0,7));
		return $ret;
	}

	/********************************
	 *
	 * SQL; TYPO3 Pages
	 *
	 *******************************/

	/**
	 * Updates db with information about the page (TYPO3 page, not external media)
	 * Writes to index_phash, index_section (via submit_section()), index_grlist (via submit_grlist()),
	 * index_fulltext and, if $this->indexerConfig['debugMode'] is set, index_debug.
	 *
	 * @return	void
	 */
	function submitPage()	{

			// Remove any current data for this phash:
		$this->removeOldIndexedPages($this->hash['phash']);

			// Read the clock once so "tstamp" and "crdate" always carry the same value:
		$now = time();

			// setting new phash_row
		$fields = array(
			'phash' => $this->hash['phash'],
			'phash_grouping' => $this->hash['phash_grouping'],
			'cHashParams' => serialize($this->cHashParams),
			'contentHash' => $this->content_md5h,
			'data_page_id' => $this->conf['id'],
			'data_page_reg1' => $this->conf['page_cache_reg1'],
			'data_page_type' => $this->conf['type'],
			'data_page_mp' => $this->conf['MP'],
			'gr_list' => $this->conf['gr_list'],
			'item_type' => 0,	// TYPO3 page
			'item_title' => $this->contentParts['title'],
			'item_description' => $this->bodyDescription($this->contentParts),
			'item_mtime' => $this->conf['mtime'],
			'item_size' => strlen($this->conf['content']),
			'tstamp' => $now,
			'crdate' => $now,
			'item_crdate' => $this->conf['crdate'],	// Creation date of page
			'sys_language_uid' => $this->conf['sys_language_uid'],	// Sys language uid of the page. Should reflect which language it DOES actually display!
			'externalUrl' => 0,
			'recordUid' => intval($this->conf['recordUid']),
			'freeIndexUid' => intval($this->conf['freeIndexUid']),
			'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
		);

		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields);

			// PROCESSING index_section
		$this->submit_section($this->hash['phash'],$this->hash['phash']);

			// PROCESSING index_grlist
		$this->submit_grlist($this->hash['phash'],$this->hash['phash']);

			// PROCESSING index_fulltext
		$fields = array(
			'phash' => $this->hash['phash'],
			'fulltextdata' => implode(' ', $this->contentParts)
		);
		if ($this->indexerConfig['fullTextDataLength']>0)	{
				// NOTE(review): substr() cuts by bytes; if the content is utf-8 at this point the last character may be cut in half - confirm.
			$fields['fulltextdata'] = substr($fields['fulltextdata'],0,$this->indexerConfig['fullTextDataLength']);
		}
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fields);

			// PROCESSING index_debug
		if ($this->indexerConfig['debugMode'])	{
			$fields = array(
				'phash' => $this->hash['phash'],
				'debuginfo' => serialize(array(
						'cHashParams' => $this->cHashParams,
						'external_parsers initialized' => array_keys($this->external_parsers),
							// "content" and "body" are limited to their first 1000 bytes in the debug record:
						'conf' => array_merge($this->conf,array('content'=>substr($this->conf['content'],0,1000))),
						'contentParts' => array_merge($this->contentParts,array('body' => substr($this->contentParts['body'],0,1000))),
						'logs' => $this->internal_log,
						'lexer' => $this->lexerObj->debugString,
					))
			);
			$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $fields);
		}
	}

	/**
	 * Stores gr_list in the database.
*
	 * Inserts one row into index_grlist, built from the two hashes and $this->conf['gr_list'].
	 *
	 * @param	integer		Search result record phash
	 * @param	integer		Actual phash of current content
	 * @return	void
	 * @see update_grlist()
	 */
	function submit_grlist($hash,$phash_x)	{

			// Read the group list once; it is stored both hashed and as-is
		$grList = $this->conf['gr_list'];

			// Assemble the index_grlist row
		$row = array();
		$row['phash'] = $hash;
		$row['phash_x'] = $phash_x;
		$row['hash_gr_list'] = $this->md5inthash($grList);
		$row['gr_list'] = $grList;

		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', $row);
	}

	/**
	 * Stores section
	 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
	 * The row is handed to getRootLineFields() before it is inserted into index_section.
	 *
	 * @param	integer		phash of TYPO3 parent search result record
	 * @param	integer		phash of the file indexation search record
	 * @return	void
	 */
	function submit_section($hash,$hash_t3)	{

			// Base row: both hashes plus the page id as integer
		$sectionRow = array();
		$sectionRow['phash'] = $hash;
		$sectionRow['phash_t3'] = $hash_t3;
		$sectionRow['page_id'] = intval($this->conf['id']);

			// Let getRootLineFields() work on the row, then store it
		$this->getRootLineFields($sectionRow);
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $sectionRow);
	}

	/**
	 * Removes records for the indexed page, $phash
	 *
	 * @param	integer		phash value to flush
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -