📄 class.indexer.php
字号:
if (is_object($this->external_parsers[$ext])) { $contentArr = $this->external_parsers[$ext]->readFileContent($ext,$absFile,$cPKey); } return $contentArr; }

    /**
     * Creates an array with pointers to divisions of document.
     * The external parser registered for the file extension is asked for the
     * divisions; if no parser object is registered, array(0) is returned.
     *
     * @param string File extension
     * @param string Absolute filename (must exist and be validated OK before calling function)
     * @return array Array of pointers to sections that the document should be divided into
     */
    function fileContentParts($ext,$absFile) {
        $cParts = array(0);

        // Consult relevant external document parser.
        // isset() first, so an extension without a registered parser does not raise a notice:
        if (isset($this->external_parsers[$ext]) && is_object($this->external_parsers[$ext])) {
            $cParts = $this->external_parsers[$ext]->fileContentParts($ext,$absFile);
        }

        return $cParts;
    }

    /**
     * Splits non-HTML content (from external files for instance)
     * Starts from a copy of $this->defaultContentArray and stores the input under "body".
     *
     * @param string Input content (non-HTML) to index.
     * @return array Array of content, having the key "body" set (plus "title", "description" and "keywords", but empty)
     * @see splitHTMLContent()
     */
    function splitRegularContent($content) {
        $contentArr = $this->defaultContentArray;
        $contentArr['body'] = $content;

        return $contentArr;
    }

    /**********************************
     *
     * Analysing content, Extracting words
     *
     **********************************/

    /**
     * Convert character set and HTML entities in the value of input content array keys
     * Empty values are skipped; all other values are modified in place.
     *
     * @param array Standard content array (passed by reference)
     * @param string Charset of the input content (converted to utf-8)
     * @return void
     */
    function charsetEntity2utf8(&$contentArr, $charset) {
        // Walk over a snapshot of the keys. foreach replaces the former
        // reset()/each() loop, because each() no longer exists in PHP 8.
        foreach (array_keys($contentArr) as $key) {
            if (strlen($contentArr[$key])) {
                // Convert charset if necessary
                if ($charset!=='utf-8') {
                    $contentArr[$key] = $this->csObj->utf8_encode($contentArr[$key], $charset);
                }

                // decode all numeric / html-entities in the string to real characters:
                $contentArr[$key] = $this->csObj->entities_to_utf8($contentArr[$key],TRUE);
            }
        }
    }

    /**
     * Processing words in the array from split*Content -functions
     *
     * @param array Array of content to index, see splitHTMLContent() and
splitRegularContent()
     * @return array Content input array modified so each key now holds an array of words (duplicates removed for title, keywords and description)
     */
    function processWordsInArrays($contentArr) {
        // split all parts to words
        // (foreach replaces the former reset()/each() loop; each() no longer exists in PHP 8)
        foreach (array_keys($contentArr) as $key) {
            $contentArr[$key] = $this->lexerObj->split2Words($contentArr[$key]);
        }

        // For title, keywords, and description we don't want duplicates:
        $contentArr['title'] = array_unique($contentArr['title']);
        $contentArr['keywords'] = array_unique($contentArr['keywords']);
        $contentArr['description'] = array_unique($contentArr['description']);

        // Return modified array:
        return $contentArr;
    }

    /**
     * Processing words in the array from split*Content -functions
     * This function is only a wrapper kept under the old, misspelled name; it
     * forwards to processWordsInArrays().
     *
     * @param array Array of content to index, see splitHTMLContent() and splitRegularContent()
     * @return array Content input array modified so each key now holds an array of words
     * @deprecated Use processWordsInArrays() instead
     */
    function procesWordsInArrays($contentArr) {
        return $this->processWordsInArrays($contentArr);
    }

    /**
     * Extracts the sample description text from the content array.
     * The maximum length is taken from $this->conf['index_descrLgd'], passed
     * through t3lib_div::intInRange(..., 0, 255, 200).
     *
     * @param array Content array
     * @return string Description string (empty string if the resulting maximum length is 0)
     */
    function bodyDescription($contentArr) {
        // Defined up front so the function always returns a string, also when
        // the length check below fails (it used to return an undefined variable).
        $bodyDescription = '';

        // Setting description
        $maxL = t3lib_div::intInRange($this->conf['index_descrLgd'],0,255,200);
        if ($maxL) {
            // Turn tabs and line breaks into plain spaces:
            $bodyDescription = str_replace(array(' ',"\t","\r","\n"),' ',$contentArr['body']);

            // Shorten the string:
            $bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
        }

        return $bodyDescription;
    }

    /**
     * Analyzes content to use for indexing,
     *
     * @param array Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
* @return array Index array keyed by word (cut to 60 bytes); each entry carries "count", "hash" and "metaphone", plus "cmp" for header words and "first" for words first seen in the body
     */
    function indexAnalyze($content) {
        $indexArr = array();

        // Header parts first; the last argument is the bit position that marks the part in "cmp"
        $this->analyzeHeaderinfo($indexArr,$content,'title',7);
        $this->analyzeHeaderinfo($indexArr,$content,'keywords',6);
        $this->analyzeHeaderinfo($indexArr,$content,'description',5);
        $this->analyzeBody($indexArr,$content);

        return $indexArr;
    }

    /**
     * Calculates relevant information for headercontent
     * For every word the bit 2^$offset is OR'ed into "cmp", "count" is
     * incremented and "hash" / "metaphone" are (re)calculated.
     * $this->wordcount is incremented once per word.
     *
     * @param array Index array, passed by reference
     * @param array Standard content array
     * @param string Key from standard content array
     * @param integer Bit-wise priority to type
     * @return void
     */
    function analyzeHeaderinfo(&$retArr,$content,$key,$offset) {
        // foreach replaces the former reset()/each() loop; each() no longer exists in PHP 8
        foreach ($content[$key] as $val) {
            $val = substr($val,0,60);    // Max 60 - because the baseword varchar IS 60. This MUST be the same.

            // Start from 0 for words not seen before instead of reading undefined array keys:
            $cmp = isset($retArr[$val]['cmp']) ? $retArr[$val]['cmp'] : 0;
            $count = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;

            $retArr[$val]['cmp'] = $cmp|pow(2,$offset);
            $retArr[$val]['count'] = $count+1;
            $retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
            $retArr[$val]['metaphone'] = $this->metaphone($val);
            $this->wordcount++;
        }
    }

    /**
     * Calculates relevant information for bodycontent
     * Words that have no entry yet get "first" (their key in the body array),
     * "hash" and "metaphone"; "count" is incremented for every occurrence.
     * $this->wordcount is incremented once per word.
     *
     * @param array Index array, passed by reference
     * @param array Standard content array
     * @return void
     */
    function analyzeBody(&$retArr,$content) {
        foreach ($content['body'] as $key => $val) {
            $val = substr($val,0,60);    // Max 60 - because the baseword varchar IS 60. This MUST be the same.

            if (!isset($retArr[$val])) {
                $retArr[$val]['first'] = $key;
                $retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
                $retArr[$val]['metaphone'] = $this->metaphone($val);
            }

            // Start from 0 for new words instead of reading an undefined array key:
            $count = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
            $retArr[$val]['count'] = $count+1;
            $this->wordcount++;
        }
    }

    /**
     * Creating metaphone based hash from input word
     * Uses $this->metaphoneObj if it is an object, otherwise PHP's metaphone().
     *
     * @param string Word to convert
     * @param boolean If set, returns the raw metaphone value (not hashed)
     * @return mixed Metaphone hash integer (or raw value, string)
     */
    function metaphone($word,$retRaw=FALSE) {
        if (is_object($this->metaphoneObj)) {
            $tmp = $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid']);
        } else {
            $tmp = metaphone($word);
        }

        // Return raw value?
        if ($retRaw) {
            return $tmp;
        }

        // Otherwise create hash and return integer (0 for an empty metaphone value)
        if ($tmp=='') {
            $ret = 0;
        } else {
            $ret = hexdec(substr(md5($tmp),0,7));
        }

        return $ret;
    }

    /********************************
     *
     * SQL; TYPO3 Pages
     *
     *******************************/

    /**
     * Updates db with information about the page (TYPO3 page, not external media)
     * Removes old records for the phash, then inserts into index_phash,
     * index_section, index_grlist, index_fulltext and - in debug mode - index_debug.
     *
     * @return void
     */
    function submitPage() {
        // Remove any current data for this phash:
        $this->removeOldIndexedPages($this->hash['phash']);

        // Read the clock once so "tstamp" and "crdate" always hold the same value:
        $now = time();

        // setting new phash_row
        $fields = array(
            'phash' => $this->hash['phash'],
            'phash_grouping' => $this->hash['phash_grouping'],
            'cHashParams' => serialize($this->cHashParams),
            'contentHash' => $this->content_md5h,
            'data_page_id' => $this->conf['id'],
            'data_page_reg1' => $this->conf['page_cache_reg1'],
            'data_page_type' => $this->conf['type'],
            'data_page_mp' => $this->conf['MP'],
            'gr_list' => $this->conf['gr_list'],
            'item_type' => 0,    // TYPO3 page
            'item_title' => $this->contentParts['title'],
            'item_description' => $this->bodyDescription($this->contentParts),
            'item_mtime' => $this->conf['mtime'],
            'item_size' => strlen($this->conf['content']),
            'tstamp' => $now,
            'crdate' => $now,
            'item_crdate' => $this->conf['crdate'],    // Creation date of page
            'sys_language_uid' => $this->conf['sys_language_uid'],    // Sys language uid of the page. Should reflect which language it DOES actually display!
            'externalUrl' => 0,
            'recordUid' => intval($this->conf['recordUid']),
            'freeIndexUid' => intval($this->conf['freeIndexUid']),
            'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
        );

        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields);

        // PROCESSING index_section
        $this->submit_section($this->hash['phash'],$this->hash['phash']);

        // PROCESSING index_grlist
        $this->submit_grlist($this->hash['phash'],$this->hash['phash']);

        // PROCESSING index_fulltext
        $fields = array(
            'phash' => $this->hash['phash'],
            'fulltextdata' => implode(' ', $this->contentParts)
        );
        // Optionally cap the stored full text at the configured number of bytes:
        if ($this->indexerConfig['fullTextDataLength']>0) {
            $fields['fulltextdata'] = substr($fields['fulltextdata'],0,$this->indexerConfig['fullTextDataLength']);
        }
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fields);

        // PROCESSING index_debug
        if ($this->indexerConfig['debugMode']) {
            $fields = array(
                'phash' => $this->hash['phash'],
                'debuginfo' => serialize(array(
                    'cHashParams' => $this->cHashParams,
                    'external_parsers initialized' => array_keys($this->external_parsers),
                    // Page content and body text are cut to 1000 bytes for the debug record:
                    'conf' => array_merge($this->conf,array('content'=>substr($this->conf['content'],0,1000))),
                    'contentParts' => array_merge($this->contentParts,array('body' => substr($this->contentParts['body'],0,1000))),
                    'logs' => $this->internal_log,
                    'lexer' => $this->lexerObj->debugString,
                ))
            );
            $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $fields);
        }
    }

    /**
     * Stores gr_list in the database.
*
     * @param integer Search result record phash
     * @param integer Actual phash of current content
     * @return void
     * @see update_grlist()
     */
    function submit_grlist($hash,$phash_x) {
        // Build the index_grlist row key by key, then insert it
        $row = array();
        $row['phash'] = $hash;
        $row['phash_x'] = $phash_x;
        $row['hash_gr_list'] = $this->md5inthash($this->conf['gr_list']);
        $row['gr_list'] = $this->conf['gr_list'];

        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', $row);
    }

    /**
     * Stores section
     * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
     * The row is handed to getRootLineFields() before it is inserted.
     *
     * @param integer phash of TYPO3 parent search result record
     * @param integer phash of the file indexation search record
     * @return void
     */
    function submit_section($hash,$hash_t3) {
        $row = array();
        $row['phash'] = $hash;
        $row['phash_t3'] = $hash_t3;
        $row['page_id'] = intval($this->conf['id']);

        // Passed as a variable so getRootLineFields() can work on the row itself
        $this->getRootLineFields($row);

        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $row);
    }

    /**
     * Removes records for the indexed page, $phash
     *
     * @param integer phash value to flush
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -