⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 class.indexer.php

📁 Typo3, 开源里边最强大的
💻 PHP
📖 第 1 页 / 共 5 页
字号:
<?php
/***************************************************************
*  Copyright notice
*
*  (c) 2001-2005 Kasper Skaarhoj (kasperYYYY@typo3.com)
*  All rights reserved
*
*  This script is part of the TYPO3 project. The TYPO3 project is
*  free software; you can redistribute it and/or modify
*  it under the terms of the GNU General Public License as published by
*  the Free Software Foundation; either version 2 of the License, or
*  (at your option) any later version.
*
*  The GNU General Public License can be found at
*  http://www.gnu.org/copyleft/gpl.html.
*  A copy is found in the textfile GPL.txt and important notices to the license
*  from the author is found in LICENSE.txt distributed with these scripts.
*
*
*  This script is distributed in the hope that it will be useful,
*  but WITHOUT ANY WARRANTY; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*  GNU General Public License for more details.
*
*  This copyright notice MUST APPEAR in all copies of the script!
***************************************************************/
/**
 * This class is a search indexer for TYPO3
 *
 * @author	Kasper Skårhøj <kasperYYYY@typo3.com>
 * Originally Christian Jul Jensen <christian@jul.net> helped as well.
*//** * [CLASS/FUNCTION INDEX of SCRIPT] * * * *  141: class tx_indexedsearch_indexer *  207:     function hook_indexContent(&$pObj) * *              SECTION: Backend API *  308:     function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE) *  347:     function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0) *  365:     function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0) * *              SECTION: Initialization *  416:     function init() *  468:     function initializeExternalParsers() * *              SECTION: Indexing; TYPO3 pages (HTML content) *  509:     function indexTypo3PageContent() *  596:     function splitHTMLContent($content) *  642:     function getHTMLcharset($content) *  657:     function convertHTMLToUtf8($content,$charset='') *  685:     function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList) *  712:     function typoSearchTags(&$body) *  741:     function extractLinks($content) *  812:     function extractHyperLinks($string) * *              SECTION: Indexing; external URL *  871:     function indexExternalUrl($externalUrl) *  902:     function getUrlHeaders($url) * *              SECTION: Indexing; external files (PDF, DOC, etc) *  948:     function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='') * 1054:     function readFileContent($ext,$absFile,$cPKey) * 1071:     function fileContentParts($ext,$absFile) * 1089:     function splitRegularContent($content) * *              SECTION: Analysing content, Extracting words * 1122:     function charsetEntity2utf8(&$contentArr, $charset) * 1145:     function processWordsInArrays($contentArr) * 1170:     function procesWordsInArrays($contentArr) * 1180:     function bodyDescription($contentArr) * 1202:     function indexAnalyze($content) * 1223:     function analyzeHeaderinfo(&$retArr,$content,$key,$offset) * 1242:  
   function analyzeBody(&$retArr,$content) * 1262:     function metaphone($word,$retRaw=FALSE) * *              SECTION: SQL; TYPO3 Pages * 1304:     function submitPage() * 1378:     function submit_grlist($hash,$phash_x) * 1398:     function submit_section($hash,$hash_t3) * 1416:     function removeOldIndexedPages($phash) * *              SECTION: SQL; External media * 1459:     function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts) * 1525:     function submitFile_grlist($hash) * 1539:     function submitFile_section($hash) * 1553:     function removeOldIndexedFiles($phash) * *              SECTION: SQL Helper functions * 1589:     function checkMtimeTstamp($mtime,$phash) * 1625:     function checkContentHash() * 1642:     function checkExternalDocContentHash($hashGr,$content_md5h) * 1656:     function is_grlist_set($phash_x) * 1669:     function update_grlist($phash,$phash_x) * 1684:     function updateTstamp($phash,$mtime=0) * 1699:     function updateSetId($phash) * 1714:     function updateParsetime($phash,$parsetime) * 1727:     function updateRootline() * 1742:     function getRootLineFields(&$fieldArr) * 1761:     function removeLoginpagesWithContentHash() * 1778:     function includeCrawlerClass() * *              SECTION: SQL; Submitting words * 1805:     function checkWordList($wl) * 1842:     function submitWords($wl,$phash) * 1866:     function freqMap($freq) * *              SECTION: Hashing * 1899:     function setT3Hashes() * 1925:     function setExtHashes($file,$subinfo=array()) * 1949:     function md5inthash($str) * 1959:     function makeCHash($paramArray) * *              SECTION: Internal logging functions * 1991:     function log_push($msg,$key) * 2000:     function log_pull() * 2011:     function log_setTSlogMessage($msg, $errorNum=0) * *              SECTION: tslib_fe hooks: * 2036:     function fe_headerNoCache(&$params, $ref) * * TOTAL FUNCTIONS: 59 * (This index is automatically 
created/updated by the extension "extdeveval")
 *
 */

require_once(PATH_t3lib.'class.t3lib_parsehtml.php');


/**
 * Indexing class for TYPO3 frontend
 *
 * Entry points visible in this part of the file:
 * - hook_indexContent(): called with the frontend (TSFE) object; copies the page state into $this->conf and starts indexing.
 * - backend_initIndexer() / backend_setFreeIndexUid(): fill $this->conf from explicit arguments instead of a TSFE object.
 *
 * @author	Kasper Skaarhoj <kasperYYYY@typo3.com>
 * @package TYPO3
 * @subpackage tx_indexedsearch
 */
class tx_indexedsearch_indexer {

		// Messages:
	var $reasons = array(
		-1 => 'mtime matched the document, so no changes detected and no content updated',
		-2 => 'The minimum age was not exceeded',
		1 => "The configured max-age was exceeded for the document and thus it's indexed.",
		2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
		3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
		4 => 'Page has never been indexed (is not represented in the index_phash table).'
	);

		// HTML code blocks to exclude from indexing:
	var $excludeSections = 'script,style';

		// Supported Extensions for external files:
	var $external_parsers = array();		// External parser objects, keys are file extension names. Values are objects with certain methods.

		// Fe-group list (pages might be indexed separately for each usergroup combination to support search in access limited pages!)
	var $defaultGrList = '0,-1';

		// Min/Max times:
	var $tstamp_maxAge = 0;		// If set, this tells a number of seconds that is the maximum age of an indexed document. Regardless of mtime the document will be re-indexed if this limit is exceeded.
	var $tstamp_minAge = 0;		// If set, this tells a minimum limit before a document can be indexed again. This is regardless of mtime.
	var $maxExternalFiles = 0;	// Max number of external files to index.
	var $forceIndexing = FALSE;		// If true, indexing is forced despite of hashes etc.
	var $crawlerActive = FALSE;		// Set when crawler is detected (internal)

		// INTERNALS:
	var $defaultContentArray=array(
		'title' => '',
		'description' => '',
		'keywords' => '',
		'body' => '',
	);
	var $wordcount = 0;
	var $externalFileCounter = 0;

	var $conf = array();		// Configuration set internally (see init functions for required keys and their meaning)
	var $indexerConfig = array();	// Indexer configuration, coming from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']
	var $hash = array();		// Hash array, contains phash and phash_grouping
	var $file_phash_arr = array();	// Hash array for files
	var $contentParts = array();	// Content of TYPO3 page
	var $content_md5h = '';

	var $internal_log = array();	// Internal log
	var $indexExternalUrl_content = '';

	var $cHashParams = array();	// cHashparams array

	var $freqRange = 32000;
	var $freqMax = 0.1;

		// Objects:
	var $csObj;				// Charset class object , t3lib_cs
	var $metaphoneObj;		// Metaphone object, if any
	var $lexerObj;			// Lexer object for word splitting



	/**
	 * Parent Object (TSFE) Initialization
	 *
	 * Decides whether the page currently held by the parent object should be indexed and, if so,
	 * copies the relevant page state into $this->conf, then calls init() and indexTypo3PageContent().
	 * When a page is skipped, the reason is passed to log_setTSlogMessage().
	 *
	 * Conditions checked, in order (all must hold for indexing to start):
	 * - $pObj->config['config']['index_enable'] is set
	 * - frontend indexing is not disabled in the extension configuration, OR the crawler requested re-indexing
	 * - the page does not have the "no_search" flag
	 * - the page is not "no_cache"
	 * - $pObj->sys_language_uid equals $pObj->sys_language_content
	 *
	 * The extension configuration, the crawler's procInstructions and the rootline are each verified
	 * to be arrays before use, so a missing or malformed value is treated as "empty" rather than
	 * producing PHP warnings.
	 *
	 * @param	object		Parent Object (frontend TSFE object), passed by reference
	 * @return	void
	 */
	function hook_indexContent(&$pObj)	{

			// Indexer configuration from Extension Manager interface:
			// unserialize() returns FALSE when the stored string is empty or corrupt; fall back to an empty configuration then.
		$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
		if (!is_array($indexerConfig))	{
			$indexerConfig = array();
		}

			// Crawler activation:
			// Requirements are that the crawler is loaded, a crawler session is running and re-indexing requested as processing instruction:
		$procInstructions = isset($pObj->applicationData['tx_crawler']['parameters']['procInstructions'])
			? $pObj->applicationData['tx_crawler']['parameters']['procInstructions']
			: NULL;
		if (t3lib_extMgm::isLoaded('crawler')
				&& !empty($pObj->applicationData['tx_crawler']['running'])
				&& is_array($procInstructions)
				&& in_array('tx_indexedsearch_reindex', $procInstructions))	{

				// Setting simple log message:
			$pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';

				// Setting variables:
			$this->crawlerActive = TRUE;	// Crawler active flag
			$this->forceIndexing = TRUE;	// Force indexing despite timestamps etc.
		}

			// Determine if page should be indexed, and if so, configure and initialize indexer
		if ($pObj->config['config']['index_enable'])	{
			$this->log_push('Index page','');

				// empty() gives the same truthiness as the plain array read but tolerates a missing key:
			if (empty($indexerConfig['disableFrontendIndexing']) || $this->crawlerActive)	{
				if (!$pObj->page['no_search'])	{
					if (!$pObj->no_cache)	{
						if (!strcmp($pObj->sys_language_uid,$pObj->sys_language_content))	{

								// Setting up internal configuration from config array:
							$this->conf = array();

								// Information about page for which the indexing takes place
							$this->conf['id'] = $pObj->id;				// Page id
							$this->conf['type'] = $pObj->type;			// Page type
							$this->conf['sys_language_uid'] = $pObj->sys_language_uid;	// sys_language UID of the language of the indexing.
							$this->conf['MP'] = $pObj->MP;				// MP variable, if any (Mount Points)
							$this->conf['gr_list'] = $pObj->gr_list;	// Group list

							$this->conf['cHash'] = $pObj->cHash;					// cHash string for additional parameters
							$this->conf['cHash_array'] = $pObj->cHash_array;		// Array of the additional parameters

							$this->conf['crdate'] = $pObj->page['crdate'];			// The creation date of the TYPO3 page
							$this->conf['page_cache_reg1'] = $pObj->page_cache_reg1;	// reg1 of the caching table. Not known what practical use this has.

								// Root line uids
							$this->conf['rootline_uids'] = array();
							if (is_array($pObj->config['rootLine']))	{
								foreach($pObj->config['rootLine'] as $rlkey => $rldat)	{
									$this->conf['rootline_uids'][$rlkey] = $rldat['uid'];
								}
							}

								// Content of page:
							$this->conf['content'] = $pObj->content;					// Content string (HTML of TYPO3 page)
							$this->conf['indexedDocTitle'] = $pObj->convOutputCharset($pObj->indexedDocTitle);	// Alternative title for indexing
							$this->conf['metaCharset'] = $pObj->metaCharset;			// Character set of content (will be converted to utf-8 during indexing)
							$this->conf['mtime'] = $pObj->register['SYS_LASTCHANGED'];	// Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed.

								// Configuration of behavior:
							$this->conf['index_externals'] = $pObj->config['config']['index_externals'];	// Whether to index external documents like PDF, DOC etc. (if possible)
							$this->conf['index_descrLgd'] = $pObj->config['config']['index_descrLgd'];		// Length of description text (max 250, default 200)

								// Set to zero:
							$this->conf['recordUid'] = 0;
							$this->conf['freeIndexUid'] = 0;
							$this->conf['freeIndexSetId'] = 0;

								// Init and start indexing:
							$this->init();
							$this->indexTypo3PageContent();
						} else $this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
					} else $this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
				} else $this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
			} else $this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
			$this->log_pull();
		}
	}







	/****************************
	 *
	 * Backend API
	 *
	 ****************************/

	/**
	 * Initializing the "combined ID" of the page (phash) being indexed (or for which external media is attached)
	 *
	 * Resets $this->conf, fills it from the arguments plus fixed defaults
	 * (gr_list '0,-1', index_externals 1, index_descrLgd 200) and then calls init().
	 * Unlike hook_indexContent() it does not start indexing itself.
	 *
	 * @param	integer		The page uid, &id=
	 * @param	integer		The page type, &type=
	 * @param	integer		sys_language uid, typically &L=
	 * @param	string		The MP variable (Mount Points), &MP=
	 * @param	array		Rootline array of only UIDs.
	 * @param	array		Array of GET variables to register with this indexing
	 * @param	boolean		If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cachable pages from the backend!
	 * @return	void
	 */
	function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE)	{

			// Setting up internal configuration from config array:
		$this->conf = array();

			// Information about page for which the indexing takes place
		$this->conf['id'] = $id;				// Page id	(integer)
		$this->conf['type'] = $type;			// Page type (integer)
		$this->conf['sys_language_uid'] = $sys_language_uid;	// sys_language UID of the language of the indexing (integer)
		$this->conf['MP'] = $MP;				// MP variable, if any (Mount Points) (string)
		$this->conf['gr_list'] = '0,-1';	// Group list (hardcoded for now...)

			// cHash values:
		$this->conf['cHash'] = $createCHash ? $this->makeCHash($cHash_array) : '';	// cHash string for additional parameters
		$this->conf['cHash_array'] = $cHash_array;		// Array of the additional parameters

			// Set to defaults
		$this->conf['freeIndexUid'] = 0;
		$this->conf['freeIndexSetId'] = 0;
		$this->conf['page_cache_reg1'] = '';

			// Root line uids
		$this->conf['rootline_uids'] = $uidRL;

			// Configuration of behavior:
		$this->conf['index_externals'] = 1;	// Whether to index external documents like PDF, DOC etc. (if possible)
		$this->conf['index_descrLgd'] = 200;		// Length of description text (max 250, default 200)

			// Init and start indexing:
		$this->init();
	}

	/**
	 * Sets the free-index uid. Can be called right after backend_initIndexer()
	 *
	 * Overwrites the two "free index" keys in $this->conf; nothing else is touched.
	 *
	 * @param	integer		Free index UID
	 * @param	integer		Set id - an integer identifying the "set" of indexing operations.
	 * @return	void
	 */
	function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0)	{
		$this->conf['freeIndexUid'] = $freeIndexUid;
		$this->conf['freeIndexSetId'] = $freeIndexSetId;
	}

	/**
	 * Indexing records as the content of a TYPO3 page.
	 *
	 * @param	string		Title equivalent
	 * @param	string		Keywords equivalent
	 * @param	string		Description equivalent

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -