⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 class.crawler.php

📁 Typo3, 开源里边最强大的
💻 PHP
📖 第 1 页 / 共 3 页
字号:
					// Finally, set entry for next indexing of batch of records:
				$nparams = array(
					'indexConfigUid' => $cfgRec['uid'],
					'url' => 'Records from UID#'.($r['uid']+1).'-?',
					'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']')
				);
				$pObj->addQueueEntry_callBack($cfgRec['set_id'],$nparams,$this->callBack,$cfgRec['pid']);
			}
		}
	}

	/**
	 * Indexing files from fileadmin
	 *
	 * If the path from the queue entry is a file, the file is indexed.
	 * If it is a directory, the files in it (filtered by the configured extensions) and - as long as the
	 * configured depth is not reached - its sub directories are added to the queue as new entries.
	 *
	 * @param	array		Indexing Configuration Record
	 * @param	array		Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
	 * @param	array		Parameters from the log queue.
	 * @param	object		Parent object (from "crawler" extension!)
	 * @return	void
	 */
	function crawler_execute_type2($cfgRec,&$session_data,$params,&$pObj)	{

			// Prepare path, making it absolute and checking:
		$readpath = $params['url'];
		if (!t3lib_div::isAbsPath($readpath))	{
			$readpath = t3lib_div::getFileAbsFileName($readpath);
		}

		if (t3lib_div::isAllowedAbsPath($readpath))	{
			if (@is_file($readpath))	{	// If file, index it!

					// Get root line (need to provide this when indexing external files)
				$rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);

					// Load indexer if not yet.
				$this->loadIndexerClass();

					// (Re)-Indexing file on page.
				$indexerObj = t3lib_div::makeInstance('tx_indexedsearch_indexer');
				$indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl);
				$indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
				$indexerObj->hash['phash'] = -1;	// EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)

					// Index document:
				$indexerObj->indexRegularDocument(substr($readpath,strlen(PATH_site)), TRUE);
			} elseif (@is_dir($readpath)) {	// If dir, read content and create new pending items for log:

					// Select files and directories in path:
				$extList = implode(',',t3lib_div::trimExplode(',',$cfgRec['extensions'],1));
				$fileArr = array();
				$files = t3lib_div::getAllFilesAndFoldersInPath($fileArr,$readpath,$extList,0,0);

				$directoryList = t3lib_div::get_dirs($readpath);
				if (is_array($directoryList) && $params['depth'] < $cfgRec['depth'])	{
						// Make sure exactly one slash separates the directory from the sub directory name,
						// also when the configured path was given without a trailing slash:
					$basePath = rtrim($readpath,'/').'/';
					foreach ($directoryList as $subdir)	{
						if ((string)$subdir!='')	{
							$files[]= $basePath.$subdir.'/';
						}
					}
				}
				$files = t3lib_div::removePrefixPathFromList($files,PATH_site);

					// traverse the items and create log entries:
				foreach($files as $path)	{
					$this->instanceCounter++;
					if ($path!==$params['url'])	{
							// Parameters:
						$nparams = array(
							'indexConfigUid' => $cfgRec['uid'],
							'url' => $path,
							'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
							'depth' => $params['depth']+1
						);
							// Spread the execution times so each new entry is scheduled a bit later than the previous one:
						$pObj->addQueueEntry_callBack($cfgRec['set_id'],$nparams,$this->callBack,$cfgRec['pid'],time()+$this->instanceCounter*$this->secondsPerExternalUrl);
					}
				}
			}
		}
	}

	/**
	 * Indexing External URLs
	 *
	 * Indexes the URL of the queue entry and - as long as the configured depth is not reached - adds
	 * the links found on it to the queue, provided they pass checkUrl() and are not denied by the
	 * "url_deny" setting of the configuration.
	 *
	 * @param	array		Indexing Configuration Record
	 * @param	array		Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
	 * @param	array		Parameters from the log queue.
	 * @param	object		Parent object (from "crawler" extension!)
	 * @return	void
	 */
	function crawler_execute_type3($cfgRec,&$session_data,$params,&$pObj)	{

			// Init session data array if not already:
		if (!is_array($session_data))	{
			$session_data = array(
				'urlLog' => array($params['url'])
			);
		}

			// Index the URL:
		$rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
		$subUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rl, $cfgRec['uid'], $cfgRec['set_id']);

			// Add more elements to log now:
		if ($params['depth'] < $cfgRec['depth'])	{
			foreach($subUrls as $url)	{
					// checkUrl() returns the cleaned-up URL, or nothing if the URL must not be indexed:
				if ($url = $this->checkUrl($url,$session_data['urlLog'],$cfgRec['externalUrl']))	{
					if (!$this->checkDeniedSuburls($url, $cfgRec['url_deny']))	{
						$this->instanceCounter++;
						$session_data['urlLog'][] = $url;

							// Parameters:
						$nparams = array(
							'indexConfigUid' => $cfgRec['uid'],
							'url' => $url,
							'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
							'depth' => $params['depth']+1
						);
						$pObj->addQueueEntry_callBack($cfgRec['set_id'],$nparams,$this->callBack,$cfgRec['pid'],time()+$this->instanceCounter*$this->secondsPerExternalUrl);
					}
				}
			}
		}
	}

	/**
	 * Page tree indexing type
	 *
	 * Submits the URLs of the page given in the queue entry and - as long as the configured depth is
	 * not reached - adds the sub pages of that page to the queue.
	 *
	 * @param	array		Indexing Configuration Record
	 * @param	array		Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
	 * @param	array		Parameters from the log queue.
	 * @param	object		Parent object (from "crawler" extension!)
	 * @return	void
	 */
	function crawler_execute_type4($cfgRec,&$session_data,$params,&$pObj)	{

			// Base page uid:
		$pageUid = intval($params['url']);

			// Get array of URLs from page:
		$pageRow = t3lib_BEfunc::getRecord('pages',$pageUid);
		$res = $pObj->getUrlsForPageRow($pageRow);

		$duplicateTrack = array();	// Registry for duplicates
		$downloadUrls = array();	// Dummy.

			// Submit URLs (the return value of urlListFromUrlArray() is not needed here):
		if (is_array($res))	{
			foreach($res as $vv)	{
				$pObj->urlListFromUrlArray($vv,$pageRow,time(),30,1,0,$duplicateTrack,$downloadUrls,array('tx_indexedsearch_reindex'));
			}
		}

			// Add subpages to log now:
		if ($params['depth'] < $cfgRec['depth'])	{

				// Subpages selected
			$recs = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
				'uid,title',
				'pages',
				'pid = '.intval($pageUid).
					t3lib_BEfunc::deleteClause('pages')
			);

				// Traverse subpages and add to queue (the result is not an array if the query failed):
			if (is_array($recs) && count($recs))	{

					// Make sure the session data can take the URL log, like crawler_execute_type3() does:
				if (!is_array($session_data))	{
					$session_data = array(
						'urlLog' => array()
					);
				}

				foreach($recs as $r)	{
					$this->instanceCounter++;
					$url = 'pages:'.$r['uid'].': '.$r['title'];
					$session_data['urlLog'][] = $url;

							// Parameters:
					$nparams = array(
						'indexConfigUid' => $cfgRec['uid'],
						'url' => $r['uid'],
						'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
						'depth' => $params['depth']+1
					);
					$pObj->addQueueEntry_callBack($cfgRec['set_id'],$nparams,$this->callBack,$cfgRec['pid'],time()+$this->instanceCounter*$this->secondsPerExternalUrl);
				}
			}
		}
	}

	/**
	 * Look up all old index configurations which are finished and needs to be reset and done
	 *
	 * A configuration counts as finished when it has a "set_id" but no queue entries with exec_time=0
	 * are left for that set. For those, index rows from older sets are removed and the configuration
	 * record is reset ("set_id" and "session_data" cleared).
	 *
	 * @return	void
	 */
	function cleanUpOldRunningConfigurations()	{

			// Lookup running index configurations:
		$runningIndexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
			'uid,set_id',
			'index_config',
			'set_id!=0'.t3lib_BEfunc::deleteClause('index_config')
		);
		if (!is_array($runningIndexingConfigurations))	{
			return;
		}

			// Tables to remove old registrations from (list copied from class.tx_indexedsearch_modfunc1.php)
		$tableArr = explode(',','index_phash,index_rel,index_section,index_grlist,index_fulltext,index_debug');

			// For each running configuration, look up how many log entries there are which are scheduled for execution and if none, clear the "set_id" (means; Processing was DONE)
		foreach($runningIndexingConfigurations as $cfgRec)	{

				// Look for ended processes:
			$queuedRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
				'count(*) AS count',
				'tx_crawler_queue',
				'set_id='.intval($cfgRec['set_id']).' AND exec_time=0'
			);

				// If the count could not be determined, do not mistake that for "nothing queued" - index data would be deleted on that basis:
			if (!is_array($queuedRows) || !isset($queuedRows[0]))	{
				continue;
			}
			$queued_items = $queuedRows[0];

			if (!$queued_items['count'])	{

					// Lookup old phash rows:
				$oldPhashRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
					'phash',
					'index_phash',
					'freeIndexUid='.intval($cfgRec['uid']).' AND freeIndexSetId!='.intval($cfgRec['set_id'])
				);

				if (is_array($oldPhashRows))	{
					foreach($oldPhashRows as $pHashRow)	{
							// Removing old registrations for all tables
						foreach($tableArr as $table)	{
							$GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash='.intval($pHashRow['phash']));
						}
					}
				}

					// End process by updating index-config record:
				$field_array = array (
					'set_id' => 0,
					'session_data' => '',
				);
				$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config','uid='.intval($cfgRec['uid']), $field_array);
			}
		}
	}

	/*****************************************
	 *
	 * Helper functions
	 *
	 *****************************************/

	/**
	 * Check if an input URL are allowed to be indexed. Depends on whether it is already present in the url log.
	 *
	 * Before checking, a double slash at the end of the URL is reduced to a single one and a fragment
	 * ("#...") is cut off. URLs containing "../" are never accepted.
	 *
	 * @param	string		URL string to check
	 * @param	array		Array of already indexed URLs (input url is looked up here and must not exist already)
	 * @param	string		Base URL of the indexing process (input URL must be "inside" the base URL!)
	 * @return	string		Returns the (cleaned-up) URL if OK, otherwise nothing is returned (NULL)
	 */
	function checkUrl($url,$urlLog,$baseUrl)	{
			// preg_replace() instead of ereg_replace(), which does not exist anymore in PHP 7:
		$url = preg_replace('/\/\/$/','/',$url);
		list($url) = explode('#',$url);

		if (!strstr($url,'../'))	{
			if (t3lib_div::isFirstPartOfStr($url,$baseUrl))	{
				if (!in_array($url,$urlLog))	{
					return $url;
				}
			}
		}
	}

	/**
	 * Indexing External URL
	 *
	 * Indexes the URL and collects the hyperlinks found in the fetched content. Links without a scheme
	 * are prefixed with scheme and host of the indexed URL.
	 *
	 * @param	string		URL, http://....
	 * @param	integer		Page id to relate indexing to.
	 * @param	array		Rootline array to relate indexing to
	 * @param	integer		Configuration UID
	 * @param	integer		Set ID value
	 * @return	array		URLs found on this page
	 */
	function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId)	{

			// Load indexer if not yet.
		$this->loadIndexerClass();

			// Index external URL:
		$indexerObj = t3lib_div::makeInstance('tx_indexedsearch_indexer');
		$indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl);
		$indexerObj->backend_setFreeIndexUid($cfgUid, $setId);
		$indexerObj->hash['phash'] = -1;	// To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)

		$indexerObj->indexExternalUrl($url);
		$url_qParts = parse_url($url);

			// Get URLs on this page:
		$subUrls = array();
		$list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);

			// Traverse links:
		if (is_array($list))	{
			foreach ($list as $linkInfo)	{

					// Decode entities:
				$subUrl = t3lib_div::htmlspecialchars_decode($linkInfo['href']);

					// Links without a scheme are made absolute; empty() also covers parse_url() returning FALSE or no "scheme" key:
				$qParts = parse_url($subUrl);
				if (empty($qParts['scheme']))	{
					$subUrl = $url_qParts['scheme'].'://'.$url_qParts['host'].'/'.t3lib_div::resolveBackPath($subUrl);
				}

				$subUrls[] = $subUrl;
			}
		}

		return $subUrls;
	}

	/**
	 * Indexing Single Record
	 *
	 * @param	array		Record to index
	 * @param	array		Configuration Record
	 * @param	array		Rootline array to relate indexing to
	 * @return	void
	 */
	function indexSingleRecord($r,$cfgRec,$rl=NULL)	{

			// Load indexer if not yet.
		$this->loadIndexerClass();

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -