📄 class.indexer.php
字号:
<?php
/***************************************************************
*  Copyright notice
*
*  (c) 2001-2005 Kasper Skaarhoj (kasperYYYY@typo3.com)
*  All rights reserved
*
*  This script is part of the TYPO3 project. The TYPO3 project is
*  free software; you can redistribute it and/or modify
*  it under the terms of the GNU General Public License as published by
*  the Free Software Foundation; either version 2 of the License, or
*  (at your option) any later version.
*
*  The GNU General Public License can be found at
*  http://www.gnu.org/copyleft/gpl.html.
*  A copy is found in the textfile GPL.txt and important notices to the license
*  from the author is found in LICENSE.txt distributed with these scripts.
*
*
*  This script is distributed in the hope that it will be useful,
*  but WITHOUT ANY WARRANTY; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
*  GNU General Public License for more details.
*
*  This copyright notice MUST APPEAR in all copies of the script!
***************************************************************/
/**
 * This class is a search indexer for TYPO3
 *
 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
 * Originally Christian Jul Jensen <christian@jul.net> helped as well.
*//** * [CLASS/FUNCTION INDEX of SCRIPT] * * * * 141: class tx_indexedsearch_indexer * 207: function hook_indexContent(&$pObj) * * SECTION: Backend API * 308: function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE) * 347: function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0) * 365: function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0) * * SECTION: Initialization * 416: function init() * 468: function initializeExternalParsers() * * SECTION: Indexing; TYPO3 pages (HTML content) * 509: function indexTypo3PageContent() * 596: function splitHTMLContent($content) * 642: function getHTMLcharset($content) * 657: function convertHTMLToUtf8($content,$charset='') * 685: function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList) * 712: function typoSearchTags(&$body) * 741: function extractLinks($content) * 812: function extractHyperLinks($string) * * SECTION: Indexing; external URL * 871: function indexExternalUrl($externalUrl) * 902: function getUrlHeaders($url) * * SECTION: Indexing; external files (PDF, DOC, etc) * 948: function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='') * 1054: function readFileContent($ext,$absFile,$cPKey) * 1071: function fileContentParts($ext,$absFile) * 1089: function splitRegularContent($content) * * SECTION: Analysing content, Extracting words * 1122: function charsetEntity2utf8(&$contentArr, $charset) * 1145: function processWordsInArrays($contentArr) * 1170: function procesWordsInArrays($contentArr) * 1180: function bodyDescription($contentArr) * 1202: function indexAnalyze($content) * 1223: function analyzeHeaderinfo(&$retArr,$content,$key,$offset) * 1242: function analyzeBody(&$retArr,$content) * 1262: function metaphone($word,$retRaw=FALSE) * * SECTION: SQL; TYPO3 Pages * 1304: function submitPage() * 1378: function submit_grlist($hash,$phash_x) * 
1398: function submit_section($hash,$hash_t3)
 * 1416: function removeOldIndexedPages($phash)
 *
 *              SECTION: SQL; External media
 * 1459: function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts)
 * 1525: function submitFile_grlist($hash)
 * 1539: function submitFile_section($hash)
 * 1553: function removeOldIndexedFiles($phash)
 *
 *              SECTION: SQL Helper functions
 * 1589: function checkMtimeTstamp($mtime,$phash)
 * 1625: function checkContentHash()
 * 1642: function checkExternalDocContentHash($hashGr,$content_md5h)
 * 1656: function is_grlist_set($phash_x)
 * 1669: function update_grlist($phash,$phash_x)
 * 1684: function updateTstamp($phash,$mtime=0)
 * 1699: function updateSetId($phash)
 * 1714: function updateParsetime($phash,$parsetime)
 * 1727: function updateRootline()
 * 1742: function getRootLineFields(&$fieldArr)
 * 1761: function removeLoginpagesWithContentHash()
 * 1778: function includeCrawlerClass()
 *
 *              SECTION: SQL; Submitting words
 * 1805: function checkWordList($wl)
 * 1842: function submitWords($wl,$phash)
 * 1866: function freqMap($freq)
 *
 *              SECTION: Hashing
 * 1899: function setT3Hashes()
 * 1925: function setExtHashes($file,$subinfo=array())
 * 1949: function md5inthash($str)
 * 1959: function makeCHash($paramArray)
 *
 *              SECTION: Internal logging functions
 * 1991: function log_push($msg,$key)
 * 2000: function log_pull()
 * 2011: function log_setTSlogMessage($msg, $errorNum=0)
 *
 *              SECTION: tslib_fe hooks:
 * 2036: function fe_headerNoCache(&$params, $ref)
 *
 * TOTAL FUNCTIONS: 59
 * (This index is automatically created/updated by the extension "extdeveval")
 *
 */

require_once(PATH_t3lib.'class.t3lib_parsehtml.php');

/**
 * Indexing class for TYPO3 frontend
 *
 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
 * @package TYPO3
 * @subpackage tx_indexedsearch
 */
class tx_indexedsearch_indexer {

    // Messages: human-readable explanations keyed by a numeric reason code.
    // Negative codes describe "not re-indexed" outcomes, positive codes "indexed" outcomes.
    var $reasons = array(
        -1 => 'mtime matched the document, so no changes detected and no content updated',
        -2 => 'The minimum age was not exceeded',
        1 => "The configured max-age was exceeded for the document and thus it's indexed.",
        2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
        3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
        4 => 'Page has never been indexed (is not represented in the index_phash table).'
    );

    // HTML code blocks to exclude from indexing (comma list of tag names):
    var $excludeSections = 'script,style';

    // Supported Extensions for external files:
    var $external_parsers = array();    // External parser objects, keys are file extension names. Values are objects with certain methods.

    // Fe-group list (pages might be indexed separately for each usergroup combination to support search in access limited pages!)
    var $defaultGrList = '0,-1';

    // Min/Max times:
    var $tstamp_maxAge = 0;             // If set, this tells a number of seconds that is the maximum age of an indexed document. Regardless of mtime the document will be re-indexed if this limit is exceeded.
    var $tstamp_minAge = 0;             // If set, this tells a minimum limit before a document can be indexed again. This is regardless of mtime.
    var $maxExternalFiles = 0;          // Max number of external files to index.

    var $forceIndexing = FALSE;         // If true, indexing is forced despite of hashes etc.
var $crawlerActive = FALSE;         // Set when crawler is detected (internal)

    // INTERNALS:
    var $defaultContentArray=array(
        'title' => '',
        'description' => '',
        'keywords' => '',
        'body' => '',
    );
    var $wordcount = 0;
    var $externalFileCounter = 0;

    var $conf = array();                // Configuration set internally (see init functions for required keys and their meaning)
    var $indexerConfig = array();       // Indexer configuration, coming from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']
    var $hash = array();                // Hash array, contains phash and phash_grouping
    var $file_phash_arr = array();      // Hash array for files
    var $contentParts = array();        // Content of TYPO3 page
    var $content_md5h = '';

    var $internal_log = array();        // Internal log
    var $indexExternalUrl_content = '';

    var $cHashParams = array();         // cHashparams array

    var $freqRange = 32000;
    var $freqMax = 0.1;

    // Objects:
    var $csObj;                         // Charset class object , t3lib_cs
    var $metaphoneObj;                  // Metaphone object, if any
    var $lexerObj;                      // Lexer object for word splitting

    /**
     * Frontend hook: decides whether the page rendered by the parent object should be
     * indexed and, if so, copies the needed state from that object into $this->conf,
     * initializes the indexer and indexes the page content.
     *
     * When indexing is enabled for the page but one of the preconditions fails, a TS log
     * message explaining the reason is written instead. The preconditions are checked in
     * this order: frontend indexing not disabled (or a crawler re-index is running),
     * page not flagged "No Search", page not rendered with no_cache, and
     * sys_language_uid equal to sys_language_content.
     *
     * @param object Parent Object (frontend TSFE object), passed by reference
     * @return void
     */
    function hook_indexContent(&$pObj) {

        // Indexer configuration from Extension Manager interface:
        $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
        if (!is_array($indexerConfig)) {
            // unserialize() returns FALSE for missing or corrupt input; treat that as "nothing configured".
            $indexerConfig = array();
        }

        // Crawler activation:
        // Requirements are that the crawler is loaded, a crawler session is running and re-indexing requested as processing instruction.
        // The is_array() check keeps in_array() from failing when the crawler run carries no processing instructions.
        if (t3lib_extMgm::isLoaded('crawler')
                && $pObj->applicationData['tx_crawler']['running']
                && is_array($pObj->applicationData['tx_crawler']['parameters']['procInstructions'])
                && in_array('tx_indexedsearch_reindex', $pObj->applicationData['tx_crawler']['parameters']['procInstructions'])) {

            // Setting simple log message:
            $pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';

            // Setting variables:
            $this->crawlerActive = TRUE;    // Crawler active flag
            $this->forceIndexing = TRUE;    // Force indexing despite timestamps etc.
        }

        // Nothing to do (and nothing to log) unless indexing is enabled in the page configuration.
        if (!$pObj->config['config']['index_enable']) {
            return;
        }

        $this->log_push('Index page','');

        // Determine if page should be indexed, and if so, configure and initialize indexer.
        // The first failing precondition decides which message is logged.
        if (!empty($indexerConfig['disableFrontendIndexing']) && !$this->crawlerActive) {
            $this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
        } elseif ($pObj->page['no_search']) {
            $this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
        } elseif ($pObj->no_cache) {
            $this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
        } elseif (strcmp($pObj->sys_language_uid,$pObj->sys_language_content)) {
            $this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
        } else {

            // Setting up internal configuration from config array:
            $this->conf = array();

            // Information about page for which the indexing takes place
            $this->conf['id'] = $pObj->id;                              // Page id
            $this->conf['type'] = $pObj->type;                          // Page type
            $this->conf['sys_language_uid'] = $pObj->sys_language_uid;  // sys_language UID of the language of the indexing.
            $this->conf['MP'] = $pObj->MP;                              // MP variable, if any (Mount Points)
            $this->conf['gr_list'] = $pObj->gr_list;                    // Group list

            $this->conf['cHash'] = $pObj->cHash;                        // cHash string for additional parameters
            $this->conf['cHash_array'] = $pObj->cHash_array;            // Array of the additional parameters

            $this->conf['crdate'] = $pObj->page['crdate'];              // The creation date of the TYPO3 page
            $this->conf['page_cache_reg1'] = $pObj->page_cache_reg1;    // reg1 of the caching table. Not known what practical use this has.

            // Root line uids (left empty when the parent object provides no rootline array)
            $this->conf['rootline_uids'] = array();
            if (is_array($pObj->config['rootLine'])) {
                foreach($pObj->config['rootLine'] as $rlkey => $rldat) {
                    $this->conf['rootline_uids'][$rlkey] = $rldat['uid'];
                }
            }

            // Content of page:
            $this->conf['content'] = $pObj->content;                                            // Content string (HTML of TYPO3 page)
            $this->conf['indexedDocTitle'] = $pObj->convOutputCharset($pObj->indexedDocTitle);  // Alternative title for indexing
            $this->conf['metaCharset'] = $pObj->metaCharset;                                    // Character set of content (will be converted to utf-8 during indexing)
            $this->conf['mtime'] = $pObj->register['SYS_LASTCHANGED'];                          // Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed.

            // Configuration of behavior:
            $this->conf['index_externals'] = $pObj->config['config']['index_externals'];        // Whether to index external documents like PDF, DOC etc. (if possible)
            $this->conf['index_descrLgd'] = $pObj->config['config']['index_descrLgd'];          // Length of description text (max 250, default 200)

            // Set to zero:
            $this->conf['recordUid'] = 0;
            $this->conf['freeIndexUid'] = 0;
            $this->conf['freeIndexSetId'] = 0;

            // Init and start indexing:
            $this->init();
            $this->indexTypo3PageContent();
        }

        $this->log_pull();
    }

    /****************************
     *
     * Backend API
     *
     ****************************/

    /**
     * Initializing the "combined ID" of the page (phash) being indexed (or for which external media is attached)
     *
     * @param integer The page uid, &id=
     * @param integer The page type, &type=
     * @param integer sys_language uid, typically &L=
     * @param string The MP variable (Mount Points), &MP=
     * @param array Rootline array of only UIDs.
     * @param array Array of GET variables to register with this indexing
     * @param boolean If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cachable pages from the backend!
* @return void
     */
    function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE) {

        // Start from a fresh configuration holding the identity of the page being indexed.
        $this->conf = array(
            'id' => $id,                                // Page id (integer)
            'type' => $type,                            // Page type (integer)
            'sys_language_uid' => $sys_language_uid,    // sys_language UID of the language of the indexing (integer)
            'MP' => $MP,                                // MP variable, if any (Mount Points) (string)
            'gr_list' => '0,-1',                        // Group list (hardcoded for now...)
        );

        // cHash values: the hash string is only computed on request, otherwise it stays empty.
        if ($createCHash) {
            $this->conf['cHash'] = $this->makeCHash($cHash_array);
        } else {
            $this->conf['cHash'] = '';
        }
        $this->conf['cHash_array'] = $cHash_array;      // Array of the additional parameters

        // Defaults for the free-index values and the cache register:
        $this->conf['freeIndexUid'] = 0;
        $this->conf['freeIndexSetId'] = 0;
        $this->conf['page_cache_reg1'] = '';

        // Root line uids, taken over as passed in:
        $this->conf['rootline_uids'] = $uidRL;

        // Behavior switches, fixed for this entry point:
        $this->conf['index_externals'] = 1;             // Whether to index external documents like PDF, DOC etc. (if possible)
        $this->conf['index_descrLgd'] = 200;            // Length of description text (max 250, default 200)

        // Run the indexer initialization with this configuration:
        $this->init();
    }

    /**
     * Stores the free-index uid and its set id in the indexer configuration.
     * Can be called right after backend_initIndexer()
     *
     * @param integer Free index UID
     * @param integer Set id - an integer identifying the "set" of indexing operations.
     * @return void
     */
    function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0) {
        $this->conf['freeIndexUid'] = $freeIndexUid;
        $this->conf['freeIndexSetId'] = $freeIndexSetId;
    }

    /**
     * Indexing records as the content of a TYPO3 page.
     *
     * @param string Title equivalent
     * @param string Keywords equivalent
     * @param string Description equivalent
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -