📄 class.crawler.php
字号:
// Finally, set entry for next indexing of batch of records:
$nparams = array(
	'indexConfigUid' => $cfgRec['uid'],
	'url' => 'Records from UID#'.($r['uid']+1).'-?',
	'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']')
);
$pObj->addQueueEntry_callBack($cfgRec['set_id'],$nparams,$this->callBack,$cfgRec['pid']);
}
}
}

/**
 * Indexing files from fileadmin
 *
 * If $params['url'] resolves to an allowed file, the file is indexed right away.
 * If it resolves to an allowed directory, every matching file in it (and, while
 * $params['depth'] is below $cfgRec['depth'], every sub directory) is added to
 * the crawler queue as a new pending entry.
 *
 * @param	array		$cfgRec Indexing Configuration Record
 * @param	array		$session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param	array		$params Parameters from the log queue.
 * @param	object		$pObj Parent object (from "crawler" extension!)
 * @return	void
 */
function crawler_execute_type2($cfgRec,&$session_data,$params,&$pObj) {

	// Prepare path, making it absolute and checking:
	$readpath = $params['url'];
	if (!t3lib_div::isAbsPath($readpath)) {
		$readpath = t3lib_div::getFileAbsFileName($readpath);
	}

	if (t3lib_div::isAllowedAbsPath($readpath)) {
		if (@is_file($readpath)) {	// If file, index it!

			// Get root line (need to provide this when indexing external files)
			$rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);

			// Load indexer if not yet.
			$this->loadIndexerClass();

			// (Re)-Indexing file on page.
			$indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
			$indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl);
			$indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
			$indexerObj->hash['phash'] = -1;	// EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)

			// Index document (path is passed relative to PATH_site):
			$indexerObj->indexRegularDocument(substr($readpath,strlen(PATH_site)), TRUE);
		} elseif (@is_dir($readpath)) {	// If dir, read content and create new pending items for log:

			// Select files and directories in path:
			$extList = implode(',',t3lib_div::trimExplode(',',$cfgRec['extensions'],1));
			$fileArr = array();
			$files = t3lib_div::getAllFilesAndFoldersInPath($fileArr,$readpath,$extList,0,0);

			// Sub directories are only queued while the configured depth is not reached:
			$directoryList = t3lib_div::get_dirs($readpath);
			if (is_array($directoryList) && $params['depth'] < $cfgRec['depth']) {
				foreach ($directoryList as $subdir) {
					if ((string)$subdir!='') {
						$files[]= $readpath.$subdir.'/';
					}
				}
			}
			$files = t3lib_div::removePrefixPathFromList($files,PATH_site);

			// traverse the items and create log entries:
			foreach ($files as $path) {
				// The counter is increased for every item (also skipped ones); it spreads the scheduled execution times.
				$this->instanceCounter++;
				if ($path!==$params['url']) {
					// Parameters:
					$nparams = array(
						'indexConfigUid' => $cfgRec['uid'],
						'url' => $path,
						'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
						'depth' => $params['depth']+1
					);
					$pObj->addQueueEntry_callBack($cfgRec['set_id'],$nparams,$this->callBack,$cfgRec['pid'],time()+$this->instanceCounter*$this->secondsPerExternalUrl);
				}
			}
		}
	}
}

/**
 * Indexing External URLs
 *
 * Indexes $params['url'] and, while $params['depth'] is below $cfgRec['depth'],
 * queues every link found on it that passes checkUrl() and is not rejected by
 * checkDeniedSuburls(). Queued URLs are remembered in $session_data['urlLog'].
 *
 * @param	array		$cfgRec Indexing Configuration Record
 * @param	array		$session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param	array		$params Parameters from the log queue.
 * @param	object		$pObj Parent object (from "crawler" extension!)
 * @return	void
 */
function crawler_execute_type3($cfgRec,&$session_data,$params,&$pObj) {

	// Init session data array if not already:
	if (!is_array($session_data)) {
		$session_data = array(
			'urlLog' => array($params['url'])
		);
	}

	// Index the URL:
	$rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
	$subUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rl, $cfgRec['uid'], $cfgRec['set_id']);

	// Add more elements to log now:
	if ($params['depth'] < $cfgRec['depth']) {
		foreach ($subUrls as $url) {
			// checkUrl() returns the normalized URL, or a false value if it must not be indexed:
			if ($url = $this->checkUrl($url,$session_data['urlLog'],$cfgRec['externalUrl'])) {
				if (!$this->checkDeniedSuburls($url, $cfgRec['url_deny'])) {
					$this->instanceCounter++;
					$session_data['urlLog'][] = $url;

					// Parameters:
					$nparams = array(
						'indexConfigUid' => $cfgRec['uid'],
						'url' => $url,
						'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
						'depth' => $params['depth']+1
					);
					$pObj->addQueueEntry_callBack($cfgRec['set_id'],$nparams,$this->callBack,$cfgRec['pid'],time()+$this->instanceCounter*$this->secondsPerExternalUrl);
				}
			}
		}
	}
}

/**
 * Page tree indexing type
 *
 * Submits the URLs of the page whose uid is given in $params['url'] and, while
 * $params['depth'] is below $cfgRec['depth'], queues all non-deleted subpages.
 *
 * @param	array		$cfgRec Indexing Configuration Record
 * @param	array		$session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param	array		$params Parameters from the log queue.
 * @param	object		$pObj Parent object (from "crawler" extension!)
 * @return	void
 */
function crawler_execute_type4($cfgRec,&$session_data,$params,&$pObj) {

	// Base page uid:
	$pageUid = intval($params['url']);

	// Get array of URLs from page:
	$pageRow = t3lib_BEfunc::getRecord('pages',$pageUid);
	$res = $pObj->getUrlsForPageRow($pageRow);

	$duplicateTrack = array();	// Registry for duplicates
	$downloadUrls = array();	// Dummy.

	// Submit URLs (the returned list is not needed here, so it is not stored):
	if (count($res)) {
		foreach ($res as $vv) {
			$pObj->urlListFromUrlArray($vv,$pageRow,time(),30,1,0,$duplicateTrack,$downloadUrls,array('tx_indexedsearch_reindex'));
		}
	}

	// Add subpages to log now:
	if ($params['depth'] < $cfgRec['depth']) {

		// Subpages selected
		$recs = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
			'uid,title',
			'pages',
			'pid = '.intval($pageUid).
				t3lib_BEfunc::deleteClause('pages')
		);

		// Traverse subpages and add to queue:
		if (count($recs)) {
			// Same guard as in crawler_execute_type3(): make sure the url log can be appended to.
			if (!is_array($session_data)) {
				$session_data = array(
					'urlLog' => array()
				);
			}

			foreach ($recs as $r) {
				$this->instanceCounter++;
				$url = 'pages:'.$r['uid'].': '.$r['title'];
				$session_data['urlLog'][] = $url;

				// Parameters (the queue entry carries the page uid in the "url" field):
				$nparams = array(
					'indexConfigUid' => $cfgRec['uid'],
					'url' => $r['uid'],
					'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
					'depth' => $params['depth']+1
				);
				$pObj->addQueueEntry_callBack($cfgRec['set_id'],$nparams,$this->callBack,$cfgRec['pid'],time()+$this->instanceCounter*$this->secondsPerExternalUrl);
			}
		}
	}
}

/**
 * Look up all old index configurations which are finished and needs to be reset and done
 *
 * For every index_config record with a non-zero set_id that has no queue
 * entries left with exec_time=0, the index rows belonging to older sets of
 * that configuration are deleted and the record's set_id / session_data are
 * cleared.
 *
 * @return	void
 */
function cleanUpOldRunningConfigurations() {

	// Lookup running index configurations:
	$runningIndexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
		'uid,set_id',
		'index_config',
		'set_id!=0'.t3lib_BEfunc::deleteClause('index_config')
	);

	// Tables holding per-phash registrations (code copied from class.tx_indexedsearch_modfunc1.php)
	$tableArr = explode(',','index_phash,index_rel,index_section,index_grlist,index_fulltext,index_debug');

	// For each running configuration, look up how many log entries there are which are scheduled for execution and if none, clear the "set_id" (means; Processing was DONE)
	foreach ($runningIndexingConfigurations as $cfgRec) {

		// Look for ended processes:
		list($queued_items) = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
			'count(*) AS count',
			'tx_crawler_queue',
			'set_id='.intval($cfgRec['set_id']).' AND exec_time=0'
		);

		if (!$queued_items['count']) {

			// Lookup old phash rows (both ids are cast to integer before they enter the query):
			$oldPhashRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
				'phash',
				'index_phash',
				'freeIndexUid='.intval($cfgRec['uid']).' AND freeIndexSetId!='.intval($cfgRec['set_id'])
			);

			// Removing old registrations for all tables:
			foreach ($oldPhashRows as $pHashRow) {
				foreach ($tableArr as $table) {
					$GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash='.intval($pHashRow['phash']));
				}
			}

			// End process by updating index-config record:
			$field_array = array (
				'set_id' => 0,
				'session_data' => '',
			);
			$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config','uid='.intval($cfgRec['uid']), $field_array);
		}
	}
}

/*****************************************
 *
 * Helper functions
 *
 *****************************************/

/**
 * Check if an input URL is allowed to be indexed. Depends on whether it is already present in the url log.
 *
 * The URL is normalized first: a trailing double slash is reduced to a single
 * slash and everything from the first "#" on is removed. It is accepted only
 * if it contains no "../", starts with $baseUrl and is not in $urlLog yet.
 *
 * @param	string		$url URL string to check
 * @param	array		$urlLog Array of already indexed URLs (input url is looked up here and must not exist already)
 * @param	string		$baseUrl Base URL of the indexing process (input URL must be "inside" the base URL!)
 * @return	string		Returns the (normalized) URL if OK, otherwise false
 */
function checkUrl($url,$urlLog,$baseUrl) {
	// preg_replace() instead of the removed ereg_replace(); the D modifier makes "$" match at the very end of the string only.
	$url = preg_replace('/\/\/$/D','/',$url);
	list($url) = explode('#',$url);

	if (!strstr($url,'../')
		&& t3lib_div::isFirstPartOfStr($url,$baseUrl)
		&& !in_array($url,$urlLog,TRUE)) {
		return $url;
	}

	return FALSE;
}

/**
 * Indexing External URL
 *
 * @param	string		$url URL, http://....
 * @param	integer		$pageId Page id to relate indexing to.
 * @param	array		$rl Rootline array to relate indexing to
 * @param	integer		$cfgUid Configuration UID
 * @param	integer		$setId Set ID value
 * @return	array		URLs found on this page
 */
function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId) {

	// Load indexer if not yet.
	$this->loadIndexerClass();

	// Index external URL:
	$indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
	$indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl);
	$indexerObj->backend_setFreeIndexUid($cfgUid, $setId);
	$indexerObj->hash['phash'] = -1;	// To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)

	$indexerObj->indexExternalUrl($url);
	$url_qParts = parse_url($url);

	// Get URLs on this page:
	$subUrls = array();
	$list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);

	// Traverse links:
	foreach ($list as $linkInfo) {

		// Decode entities:
		$subUrl = t3lib_div::htmlspecialchars_decode($linkInfo['href']);

		// Links without a scheme are prefixed with scheme and host of the indexed URL.
		// empty() also covers a missing "scheme" key and a failed parse_url() without raising a notice.
		$qParts = parse_url($subUrl);
		if (empty($qParts['scheme'])) {
			$subUrl = $url_qParts['scheme'].'://'.$url_qParts['host'].'/'.t3lib_div::resolveBackPath($subUrl);
		}

		$subUrls[] = $subUrl;
	}

	return $subUrls;
}

/**
 * Indexing Single Record
 *
 * @param	array		Record to index
 * @param	array		Configuration Record
 * @param	array		Rootline array to relate indexing to
 * @return	void
 */
function indexSingleRecord($r,$cfgRec,$rl=NULL) {

	// Load indexer if not yet.
	$this->loadIndexerClass();
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -