⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 indexer.h

📁 网页抓取程序
💻 H
📖 第 1 页 / 共 2 页
字号:
				cont=0;								while(ahList[cont].htmlChar && bAscFound==0 )				{					if( strcmp( ahList[cont].htmlChar, ssaa ) == 0 )					{						if(ahList[cont].type==1)	/*ascii*/						{							if(y+strlen(ahList[cont].rep)<maxout)							{								strcat(out,ahList[cont].rep);								y+=strlen(ahList[cont].rep);								bAscFound=1;							}							else								return 0;						}						else						/*UTF8*/						{							if(y+11+strlen(ahList[cont].rep)+14<maxout)							{								strcat(out,"', CONVERT(");			/* 11           + */								strcat(out,ahList[cont].rep);		/* strlen(utf8) + */								strcat(out," using UTF8),'");		/* 14           = */								y+=11+strlen(ahList[cont].rep)+14;								bAscFound=1;							}							else								return 0;						}						x+=strlen(ssaa)+1;											}	/*if( strcmp( ahList[cont].htmlChar, ssaa ) == 0 )*/										cont++;									}	/*while*/							}	/*if(aass && aass-(text+x) < 10)*/						if(bAscFound==0)				out[y++]='&';					}		else            out[y++]=curC;			}		return 1;}/*****************************************************************************************//* OOI: ows own index *//*flag = 0 = index only host.Pageflag = 1 = index all pages from the domain pointed by host.Hostflag = 2 = index all un-indexed pages*/int BuildOwsOwnIndex(struct sHost* host, unsigned int flag){	MYSQL_RES gRes;	MYSQL_RES** tmpRes=NULL;	MYSQL_ROW row;	char* sqlQuery;	OOI_NODE* lexicon;	unsigned int res_elements, counter = 0;		tmpRes=(MYSQL_RES**)malloc(sizeof(MYSQL_RES));	sqlQuery = malloc(MAXQUERYSIZE);	lexicon_number_of_elements = 0;	lexicon_actual_size = LEXICONWORDSIZE;	lexicon = InitLexicon();	switch(flag)	{		case 0:			sprintf(sqlQuery,"DELETE ii FROM pagelist, ii WHERE pagelist.hostname =\'%s\' AND pagelist.page =\'%s\' AND ii.pageid = pagelist.id ",host->Host, host->Page);			my_mysql_query(&gMysqlDB2, sqlQuery,NO_BLOCK);			snprintf_mysql_escaped_sql_statement(&gMysqlDB2,sqlQuery,MAXQUERYSIZE-1,"SELECT id, text FROM pagelist WHERE hostname = \'%s\' and page = \'%s\' ",host->Host, host->Page);		break;		case 1:			sprintf(sqlQuery,"DELETE ii FROM pagelist, ii WHERE pagelist.hostname =\'%s\' AND ii.pageid = pagelist.id ",host->Host);			my_mysql_query(&gMysqlDB2, sqlQuery,NO_BLOCK);			snprintf_mysql_escaped_sql_statement(&gMysqlDB2,sqlQuery,MAXQUERYSIZE-1,"SELECT id, text FROM pagelist WHERE hostname =\'%s\' ",host->Host);		break;		case 2:			snprintf_mysql_escaped_sql_statement(&gMysqlDB2,sqlQuery,MAXQUERYSIZE-1,"SELECT pagelist.id, pagelist.text FROM pagelist LEFT OUTER JOIN view_ii on pagelist.id = view_ii.pageid where view_ii.pageid is NULL ");		break;	}			my_mysql_query_and_store_results(&gMysqlDB2,sqlQuery,tmpRes,&gRes,NO_BLOCK);		FREE(sqlQuery);    res_elements = mysql_affected_rows(&gMysqlDB2);		if(flag > 0)		printf("Building OpenWebSpider Own Index (0 docs of %i)...          ", res_elements);	fflush(stdout);			while( (row = mysql_fetch_row(&gRes)) )	{		IndexPage2((char*)row[1], atoi(row[0]),&lexicon);		counter++;		if(counter % 10 == 0 || counter == res_elements)			printf("\rBuilding OpenWebSpider Own Index (%i docs of %i)...          ", counter, res_elements);		/* every OWSINDEXMAXSWAPDELAY pages swap the index to te DB and reinit the structures */		if(counter % OWSINDEXMAXSWAPDELAY == 0)		{			StoreOwsIndex(lexicon);			FreeOwsIndex(lexicon);			/* reinit all */			lexicon_number_of_elements = 0;			lexicon_actual_size = LEXICONWORDSIZE;			lexicon = InitLexicon();			printf("\rBuilding OpenWebSpider Own Index (%i docs of %i)...          ", counter, res_elements);		}	}		if(*tmpRes)	{		mysql_free_result(*tmpRes);	}		FREE(tmpRes);	printf("\r\n");	StoreOwsIndex(lexicon);	FreeOwsIndex(lexicon);	printf("\r\n");	return 1;}int IndexPage2(char* text, unsigned int page_id, OOI_NODE** lexicon){	char* pCh = NULL;	unsigned int wordLen;	unsigned int position = 0;		/* step 1: we split all tokens */	pCh = strtok (text,INDEXERTOKENS);		if(pCh==NULL || pCh[0]==0)		return 0;		while(pCh != NULL)	{		wordLen = strlen(pCh);		if(wordLen>OWSINDEXMINWORDSIZE && wordLen<OWSINDEXMAXWORDSIZE)		{			if(ndzLookForWord(*lexicon,_strupr(pCh))==-1)	//Add unique word				lstAddWord(lexicon,pCh);						UpdateInvertedIndex(*lexicon, pCh,page_id, position);						position ++ ;		}		pCh = strtok (NULL, INDEXERTOKENS);	}		return 1;}void UpdateInvertedIndex(OOI_NODE* lexicon, char* word, unsigned int doc_id, unsigned int position){	int pos;	INVERTED_INDEX* ii;	INVERTED_INDEX* last;		pos = ndzLookForWord(lexicon, word);		if(pos==-1)		return;		ii = lexicon[pos].ii;		last = ii->last;		last->next = malloc(sizeof(INVERTED_INDEX));	last->next->doc_id = doc_id;	last->next->position = position;	last->next->next = NULL;		ii->last = last->next;}/* GetWordId*  if the page exists returns its id*  else returns 0*/int GetWordId(char* word){	char* sqlQuery;	MYSQL_RES gRes;	MYSQL_RES** tmpRes=NULL;	MYSQL_ROW row;	unsigned int ret;		tmpRes=(MYSQL_RES**)malloc(sizeof(MYSQL_RES));		sqlQuery = malloc(MAXQUERYSIZE);		if(tmpRes==NULL || sqlQuery==NULL)		MemoryCorruptedHandler("GetWordId");		snprintf_mysql_escaped_sql_statement(&gMysqlDB2,sqlQuery,MAXQUERYSIZE-1,"SELECT id FROM %s.wordlist WHERE word='%s' LIMIT 1", DB2, word);		my_mysql_query_and_store_results(&gMysqlDB2, sqlQuery,tmpRes,&gRes,BLOCKINDEX);		FREE(sqlQuery);		row = mysql_fetch_row(&gRes);		if(row)		ret = atoi(row[0]);	else		ret = 0;			if(*tmpRes)	{		mysql_free_result(*tmpRes);	}		FREE(tmpRes);			return ret;}void StoreOwsIndex(OOI_NODE* lexicon){	MYCSTR AsqlQuery;	unsigned int i;	INVERTED_INDEX* ii;	char* sqlQuery;	unsigned int word_id;	char strTmp[50];	unsigned int isFirst;	/*init*/	AsqlQuery.myString=NULL;	sqlQuery = malloc(MAXQUERYSIZE);	my_mysql_ping(&gMysqlDB2,BLOCKINDEX);		for(i=0;i<lexicon_number_of_elements;i++)	{		//printf("\n%i -- %s\n",lexicon[i].id, lexicon[i].field);		if( (i+1) % 500 == 0 || i == lexicon_number_of_elements-1)			printf("\rStoring OpenWebSpider Index to the DB(%i words of %i)...          ",i+1 , lexicon_number_of_elements);				//Add word (the table has an unique index on the field word)		snprintf(sqlQuery,MAXQUERYSIZE,"INSERT INTO %s.wordlist (word) VALUES('%s')", DB2, lexicon[i].field);				my_mysql_query(&gMysqlDB2, sqlQuery, NO_BLOCK);				/* *** */				word_id = GetWordId(lexicon[i].field);				/* is the word in the DB? */		if(word_id > 0)		{			myCStrCpy(&AsqlQuery, "INSERT INTO ");			myCStrCat(&AsqlQuery, DB2);			myCStrCat(&AsqlQuery, ".ii (wordid, pageid, position) VALUES");			isFirst = 1;			ii = lexicon[i].ii;			if(lexicon[i].ii && lexicon[i].ii->last && lexicon[i].ii->last != lexicon[i].ii)			{				while(ii != NULL)				{					if(ii->doc_id>0)					{						if(isFirst)						{							snprintf(strTmp,50,"(%i,%i,%i)",word_id, ii->doc_id, ii->position );							isFirst=0;						}						else							snprintf(strTmp,50,",(%i,%i,%i)",word_id, ii->doc_id, ii->position );						myCStrCat(&AsqlQuery, strTmp);					}					ii = ii->next;				}				if(my_mysql_query(&gMysqlDB2, AsqlQuery.myString, NO_BLOCK))				{					ERROR_LOG(mysql_error(&gMysqlDB2))					ERROR_LOG(AsqlQuery.myString)					printf("\r\nQuery Error in function StoreOwsIndex(): %s\r\n",mysql_error(&gMysqlDB2));				}			}				}	}		FREE(sqlQuery);	FREE(AsqlQuery.myString);}#endif/*EOF*/

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -