📄 indexer.h
字号:
cont=0; while(ahList[cont].htmlChar && bAscFound==0 ) { if( strcmp( ahList[cont].htmlChar, ssaa ) == 0 ) { if(ahList[cont].type==1) /*ascii*/ { if(y+strlen(ahList[cont].rep)<maxout) { strcat(out,ahList[cont].rep); y+=strlen(ahList[cont].rep); bAscFound=1; } else return 0; } else /*UTF8*/ { if(y+11+strlen(ahList[cont].rep)+14<maxout) { strcat(out,"', CONVERT("); /* 11 + */ strcat(out,ahList[cont].rep); /* strlen(utf8) + */ strcat(out," using UTF8),'"); /* 14 = */ y+=11+strlen(ahList[cont].rep)+14; bAscFound=1; } else return 0; } x+=strlen(ssaa)+1; } /*if( strcmp( ahList[cont].htmlChar, ssaa ) == 0 )*/ cont++; } /*while*/ } /*if(aass && aass-(text+x) < 10)*/ if(bAscFound==0) out[y++]='&'; } else out[y++]=curC; } return 1;}/*****************************************************************************************//* OOI: ows own index *//*flag = 0 = index only host.Pageflag = 1 = index all pages from the domain pointed by host.Hostflag = 2 = index all un-indexed pages*/int BuildOwsOwnIndex(struct sHost* host, unsigned int flag){ MYSQL_RES gRes; MYSQL_RES** tmpRes=NULL; MYSQL_ROW row; char* sqlQuery; OOI_NODE* lexicon; unsigned int res_elements, counter = 0; tmpRes=(MYSQL_RES**)malloc(sizeof(MYSQL_RES)); sqlQuery = malloc(MAXQUERYSIZE); lexicon_number_of_elements = 0; lexicon_actual_size = LEXICONWORDSIZE; lexicon = InitLexicon(); switch(flag) { case 0: sprintf(sqlQuery,"DELETE ii FROM pagelist, ii WHERE pagelist.hostname =\'%s\' AND pagelist.page =\'%s\' AND ii.pageid = pagelist.id ",host->Host, host->Page); my_mysql_query(&gMysqlDB2, sqlQuery,NO_BLOCK); snprintf_mysql_escaped_sql_statement(&gMysqlDB2,sqlQuery,MAXQUERYSIZE-1,"SELECT id, text FROM pagelist WHERE hostname = \'%s\' and page = \'%s\' ",host->Host, host->Page); break; case 1: sprintf(sqlQuery,"DELETE ii FROM pagelist, ii WHERE pagelist.hostname =\'%s\' AND ii.pageid = pagelist.id ",host->Host); my_mysql_query(&gMysqlDB2, sqlQuery,NO_BLOCK); snprintf_mysql_escaped_sql_statement(&gMysqlDB2,sqlQuery,MAXQUERYSIZE-1,"SELECT id, text FROM pagelist WHERE hostname =\'%s\' ",host->Host); break; case 2: snprintf_mysql_escaped_sql_statement(&gMysqlDB2,sqlQuery,MAXQUERYSIZE-1,"SELECT pagelist.id, pagelist.text FROM pagelist LEFT OUTER JOIN view_ii on pagelist.id = view_ii.pageid where view_ii.pageid is NULL "); break; } my_mysql_query_and_store_results(&gMysqlDB2,sqlQuery,tmpRes,&gRes,NO_BLOCK); FREE(sqlQuery); res_elements = mysql_affected_rows(&gMysqlDB2); if(flag > 0) printf("Building OpenWebSpider Own Index (0 docs of %i)... ", res_elements); fflush(stdout); while( (row = mysql_fetch_row(&gRes)) ) { IndexPage2((char*)row[1], atoi(row[0]),&lexicon); counter++; if(counter % 10 == 0 || counter == res_elements) printf("\rBuilding OpenWebSpider Own Index (%i docs of %i)... ", counter, res_elements); /* every OWSINDEXMAXSWAPDELAY pages swap the index to te DB and reinit the structures */ if(counter % OWSINDEXMAXSWAPDELAY == 0) { StoreOwsIndex(lexicon); FreeOwsIndex(lexicon); /* reinit all */ lexicon_number_of_elements = 0; lexicon_actual_size = LEXICONWORDSIZE; lexicon = InitLexicon(); printf("\rBuilding OpenWebSpider Own Index (%i docs of %i)... ", counter, res_elements); } } if(*tmpRes) { mysql_free_result(*tmpRes); } FREE(tmpRes); printf("\r\n"); StoreOwsIndex(lexicon); FreeOwsIndex(lexicon); printf("\r\n"); return 1;}int IndexPage2(char* text, unsigned int page_id, OOI_NODE** lexicon){ char* pCh = NULL; unsigned int wordLen; unsigned int position = 0; /* step 1: we split all tokens */ pCh = strtok (text,INDEXERTOKENS); if(pCh==NULL || pCh[0]==0) return 0; while(pCh != NULL) { wordLen = strlen(pCh); if(wordLen>OWSINDEXMINWORDSIZE && wordLen<OWSINDEXMAXWORDSIZE) { if(ndzLookForWord(*lexicon,_strupr(pCh))==-1) //Add unique word lstAddWord(lexicon,pCh); UpdateInvertedIndex(*lexicon, pCh,page_id, position); position ++ ; } pCh = strtok (NULL, INDEXERTOKENS); } return 1;}void UpdateInvertedIndex(OOI_NODE* lexicon, char* word, unsigned int doc_id, unsigned int position){ int pos; INVERTED_INDEX* ii; INVERTED_INDEX* last; pos = ndzLookForWord(lexicon, word); if(pos==-1) return; ii = lexicon[pos].ii; last = ii->last; last->next = malloc(sizeof(INVERTED_INDEX)); last->next->doc_id = doc_id; last->next->position = position; last->next->next = NULL; ii->last = last->next;}/* GetWordId* if the page exists returns its id* else returns 0*/int GetWordId(char* word){ char* sqlQuery; MYSQL_RES gRes; MYSQL_RES** tmpRes=NULL; MYSQL_ROW row; unsigned int ret; tmpRes=(MYSQL_RES**)malloc(sizeof(MYSQL_RES)); sqlQuery = malloc(MAXQUERYSIZE); if(tmpRes==NULL || sqlQuery==NULL) MemoryCorruptedHandler("GetWordId"); snprintf_mysql_escaped_sql_statement(&gMysqlDB2,sqlQuery,MAXQUERYSIZE-1,"SELECT id FROM %s.wordlist WHERE word='%s' LIMIT 1", DB2, word); my_mysql_query_and_store_results(&gMysqlDB2, sqlQuery,tmpRes,&gRes,BLOCKINDEX); FREE(sqlQuery); row = mysql_fetch_row(&gRes); if(row) ret = atoi(row[0]); else ret = 0; if(*tmpRes) { mysql_free_result(*tmpRes); } FREE(tmpRes); return ret;}void StoreOwsIndex(OOI_NODE* lexicon){ MYCSTR AsqlQuery; unsigned int i; INVERTED_INDEX* ii; char* sqlQuery; unsigned int word_id; char strTmp[50]; unsigned int isFirst; /*init*/ AsqlQuery.myString=NULL; sqlQuery = malloc(MAXQUERYSIZE); my_mysql_ping(&gMysqlDB2,BLOCKINDEX); for(i=0;i<lexicon_number_of_elements;i++) { //printf("\n%i -- %s\n",lexicon[i].id, lexicon[i].field); if( (i+1) % 500 == 0 || i == lexicon_number_of_elements-1) printf("\rStoring OpenWebSpider Index to the DB(%i words of %i)... ",i+1 , lexicon_number_of_elements); //Add word (the table has an unique index on the field word) snprintf(sqlQuery,MAXQUERYSIZE,"INSERT INTO %s.wordlist (word) VALUES('%s')", DB2, lexicon[i].field); my_mysql_query(&gMysqlDB2, sqlQuery, NO_BLOCK); /* *** */ word_id = GetWordId(lexicon[i].field); /* is the word in the DB? */ if(word_id > 0) { myCStrCpy(&AsqlQuery, "INSERT INTO "); myCStrCat(&AsqlQuery, DB2); myCStrCat(&AsqlQuery, ".ii (wordid, pageid, position) VALUES"); isFirst = 1; ii = lexicon[i].ii; if(lexicon[i].ii && lexicon[i].ii->last && lexicon[i].ii->last != lexicon[i].ii) { while(ii != NULL) { if(ii->doc_id>0) { if(isFirst) { snprintf(strTmp,50,"(%i,%i,%i)",word_id, ii->doc_id, ii->position ); isFirst=0; } else snprintf(strTmp,50,",(%i,%i,%i)",word_id, ii->doc_id, ii->position ); myCStrCat(&AsqlQuery, strTmp); } ii = ii->next; } if(my_mysql_query(&gMysqlDB2, AsqlQuery.myString, NO_BLOCK)) { ERROR_LOG(mysql_error(&gMysqlDB2)) ERROR_LOG(AsqlQuery.myString) printf("\r\nQuery Error in function StoreOwsIndex(): %s\r\n",mysql_error(&gMysqlDB2)); } } } } FREE(sqlQuery); FREE(AsqlQuery.myString);}#endif/*EOF*/
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -