⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 content_rec.c

📁 C编写的用来实现search engine的推荐功能
💻 C
字号:
/**
 * content_rec.c, content based recommendation Implementation
 * zhiyong zhang
 * louisville May 23th
 **/
#include <stdio.h>
#include <string.h>
#include <strings.h>
#include <math.h>

#include "c_http.h"
#include "NonbSocket2.h"
#include "recdef.h"
#include "getUrlTermFreqs.h"

#define MAX_TERMS_PER_VECTOR 20

/* Request settings handed unchanged to every HTTPRequest() call in this file. */
static char* extraHeaders = NULL;
static REQUEST_METHOD_T method = M_GET;
static int timeout = 100;

/*
 * Flags consulted by processResponseBody(): when either is non-zero the
 * received chunk is NUL-terminated. Both are off by default.
 */
static int show1stLine = 0;
static int showAllLine = 0;
static int processResponseBody(int hSocket, char* buf, int size);

/* Search front end: request URL = search_root + search_params + query. */
static char search_root[256] = "http://webmining.spd.louisville.edu:8081/";
//static char search_root[256] = "http://136.165.45.51:8080/";
/*
 * search_params must live on its own line: when it shared a line with the
 * commented-out alternate root above, the `//` comment swallowed it and the
 * identifier was left undeclared.
 */
static char search_params[50] = "search.jsp?query=";

/* Databases passed to getPageTerms() for page terms and anchor terms. */
static char terms_gdbm[256] = "/home/zhiyong/software/apache2/cgi-bin/terms.gdbm";
static char anchors_gdbm[256] = "/home/zhiyong/software/apache2/cgi-bin/anchors.gdbm";

/* Markers used to cut the result list out of the returned search page. */
static char form_bflag[20] = "<form";
static char form_eflag[20] = "</form>";
static char logo_flag[50] = "<a href=\"http://www.nutch.org/\">";

static int processResponseBody(int hSocket, char* buf, int size){	int readByte = 0;		while (1) {		int ret = Receive(hSocket, buf, size, 1);		switch (ret) {		case SOCKET_ERROR:		case SOCKET_CLOSED:			return readByte;		case SOCKET_TIMEOUT:			break;		default:			if (show1stLine) {				buf[ret] = '\0';				//writeLog(buf);				//fwrite(buf, 1, ret, stdout);				show1stLine = 0;			} else if (showAllLine) {				//fwrite(buf, 1, ret, stdout);				buf[ret] = '\0';				//writeLog(buf);			}			readByte += ret;			break;		}	}}

/** * get content-based recommendation results. **/int getContentBasedRecs(SESSION_LINKS links, char* buf){	int i,iRet;	char orig_url[256];	char cur_url[256];	char pre_url[256];	PAGE_TERMS terms1, terms2;	char term_freq[100];	char term_vector[20480];	HTTP_RESPONSE_HEADER_T headers;	char request_str[1024];	char search_query[1024];	int len;	int linkNum;	char content[20481];	char temp[20481];	char *ptr = NULL;	char *pptr = NULL;		bzero(&terms1, sizeof(terms1));	bzero(&terms2, sizeof(terms2));	bzero(term_vector, sizeof(term_vector));	bzero(cur_url, sizeof(cur_url));	bzero(pre_url, sizeof(pre_url));	strcpy(orig_url, links.urls[0].url);	iRet = getPageTerms(orig_url, &terms1, terms_gdbm);	if(iRet < 0)	{		fprintf(stderr, "failed to get page terms from %s\n", orig_url);		return -1;	}	linkNum = links.num;	for(i=1;i<linkNum;i++)	{		if(!strcmp(links.urls[i].url, links.urls[i-1].url)) 		{			//skip refreshing visits			continue;		}		//sprintf(orig_url, "%s%s", orig_root, sessionLinks[i]);		strcpy(orig_url, links.urls[i].url);		iRet = getPageTerms(orig_url, &terms2, terms_gdbm);		if(iRet < 0)		{			fprintf(stderr, "failed to get page terms from %s\n", orig_url);			return -1;		}		iRet = mergePageTerms(&terms1, &terms2);		if(iRet < 0)		{			fprintf(stderr, "merger page terms error \n");			return -1;		}			}		sprintf(term_freq, "%s_%d", terms1.terms[0].term, terms1.terms[0].freq);	strcpy(term_vector, term_freq);	for(i=1;i<terms1.termNum && i<MAX_TERMS_PER_VECTOR;i++)	{		strcat(term_vector, "_");		sprintf(term_freq, "%s_%d", terms1.terms[i].term, terms1.terms[i].freq);		strcat(term_vector, term_freq);		//if(i >= MAX_TERMS_PER_VECTOR || terms1.terms[i].freq <=1) break;	}	//printf("term vector = %s <br>", term_vector);	initResponseHeader(&headers);	headers.processResponseBody = processResponseBody;	if(linkNum>=2)	{		sprintf(cur_url, "%s", links.urls[linkNum-1].url);		sprintf(pre_url, "%s", links.urls[linkNum-2].url);		if(terms1.termNum == 0)		{//to improve speed, don't use boolean minus 
operation.#if MINUS_CURRENT			sprintf(search_query, "pageurl:%s+-url:%s+-url:%s", cur_url, cur_url, pre_url);#else			sprintf(search_query, "pageurl:%s", cur_url);#endif		}		else		{#if MINUS_CURRENT			sprintf(search_query, "termvec:%s+-url:%s+-url:%s", term_vector, cur_url, pre_url);#else			sprintf(search_query, "termvec:%s", term_vector);#endif		}	}	else if(linkNum>=1)	{		sprintf(cur_url, "%s", links.urls[linkNum-1].url);		if(terms1.termNum == 0)		{#if MINUS_CURRENT			sprintf(search_query, "pageurl:%s+-url:%s", cur_url, cur_url);#else			sprintf(search_query, "pageurl:%s", cur_url);#endif		}		else		{#if MINUS_CURRENT			sprintf(search_query, "termvec:%s+-url:%s", term_vector, cur_url);#else			sprintf(search_query, "termvec:%s", term_vector);#endif		}	}		sprintf(request_str, "%s%s%s", search_root, search_params, search_query);	len = HTTPRequest(request_str, method, extraHeaders, &headers, timeout, content, 20481);	if(len <= 0) return -1;	strcpy(temp, content);	//post processing content	ptr = strstr(temp, form_bflag);	if(ptr == NULL) 	{		fprintf(stderr, "Strange, can't find search begin button.\n");		goto EXIT;;	}	pptr = ptr + strlen(form_bflag);	ptr = strstr(pptr, form_eflag);	if(ptr == NULL)	{		fprintf(stderr, "Strange, can't find search end button.\n");		goto EXIT;;	}	pptr = ptr + strlen(form_eflag);	ptr = strstr(pptr, form_bflag);	if(ptr != NULL) *ptr = '\0';	else	{		ptr = strstr(pptr, logo_flag);		if(ptr == NULL)		{			fprintf(stderr, "Strange, can't find search logo.\n");			goto EXIT;		}		*ptr = '\0';	}	strcpy(buf, pptr);	return strlen(buf);EXIT:	strncpy(buf, content, 10240);	buf[10239] = '\0';	return strlen(buf);}/** * get anchor text based recommendation results. 
**/int getAnchorBasedRecs(SESSION_LINKS links, char* buf){	int i,iRet;	char orig_url[256];	char cur_url[256];	char pre_url[256];	PAGE_TERMS terms1, terms2;	char term_freq[100];	char term_vector[20480];	HTTP_RESPONSE_HEADER_T headers;	char request_str[1024];	char search_query[1024];	int len;	int linkNum;	char content[20481];	char temp[20481];	char *ptr = NULL;	char *pptr = NULL;		bzero(&terms1, sizeof(terms1));	bzero(&terms2, sizeof(terms2));	bzero(term_vector, sizeof(term_vector));	bzero(cur_url, sizeof(cur_url));	bzero(pre_url, sizeof(pre_url));	strcpy(orig_url, links.urls[0].url);	iRet = getPageTerms(orig_url, &terms1, anchors_gdbm);	if(iRet < 0)	{		fprintf(stderr, "failed to get anchor terms from %s\n", orig_url);		return -1;	}	linkNum = links.num;	for(i=1;i<linkNum;i++)	{		if(!strcmp(links.urls[i].url, links.urls[i-1].url)) 		{			//skip refreshing visits			continue;		}		strcpy(orig_url, links.urls[i].url);		iRet = getPageTerms(orig_url, &terms2, anchors_gdbm);		if(iRet < 0)		{			fprintf(stderr, "failed to get anchor terms from %s\n", orig_url);			return -1;		}		iRet = mergePageTerms(&terms1, &terms2);		if(iRet < 0)		{			fprintf(stderr, "merger anchor terms error \n");			return -1;		}			}	if(terms1.termNum == 0) 	{		return 0;	}	//sprintf(term_freq, "%s_%d", terms1.terms[0].term, terms1.terms[0].freq);	sprintf(term_freq, "%s", terms1.terms[0].term);	strcpy(term_vector, term_freq);	for(i=1;i<terms1.termNum && i<MAX_TERMS_PER_VECTOR && i<4;i++)	{		strcat(term_vector, "+");		//sprintf(term_freq, "%s_%d", terms1.terms[i].term, terms1.terms[i].freq);		sprintf(term_freq, "%s", terms1.terms[i].term);		strcat(term_vector, term_freq);	}	initResponseHeader(&headers);	headers.processResponseBody = processResponseBody;	if(linkNum>=2)	{		//sprintf(cur_url, "%s%s", orig_root, sessionLinks[linkNum-1]);		sprintf(cur_url, "%s", links.urls[linkNum-1].url);		sprintf(pre_url, "%s", links.urls[linkNum-2].url);#if MINUS_CURRENT		//sprintf(search_query, "termvec:%s+-url:%s+-url:%s", 
term_vector, cur_url, pre_url);		sprintf(search_query, "%s+-url:%s+-url:%s", term_vector, cur_url, pre_url);#else		//sprintf(search_query, "termvec:%s", term_vector);		sprintf(search_query, "%s", term_vector);#endif	}	else if(linkNum>=1)	{		sprintf(cur_url, "%s", links.urls[linkNum-1].url);#if MINUS_CURRENT		//sprintf(search_query, "termvec:%s+-url:%s", term_vector, cur_url);		sprintf(search_query, "%s+-url:%s", term_vector, cur_url);#else		//sprintf(search_query, "termvec:%s", term_vector);		sprintf(search_query, "%s", term_vector);#endif	}		sprintf(request_str, "%s%s%s", search_root, search_params, search_query);	len = HTTPRequest(request_str, method, extraHeaders, &headers, timeout, content, 20481);	if(len <= 0) return -1;	strcpy(temp, content);	//post processing content	ptr = strstr(temp, form_bflag);	if(ptr == NULL) 	{		fprintf(stderr, "Strange, can't find search begin button.\n");		goto EXIT;;	}	pptr = ptr + strlen(form_bflag);	ptr = strstr(pptr, form_eflag);	if(ptr == NULL)	{		fprintf(stderr, "Strange, can't find search end button.\n");		goto EXIT;;	}	pptr = ptr + strlen(form_eflag);	ptr = strstr(pptr, form_bflag);	if(ptr != NULL) *ptr = '\0';	else	{		ptr = strstr(pptr, logo_flag);		if(ptr == NULL)		{			fprintf(stderr, "Strange, can't find search logo.\n");			goto EXIT;		}		*ptr = '\0';	}	strcpy(buf, pptr);	return strlen(buf);EXIT:	strncpy(buf, content, 10240);	buf[10239] = '\0';	return strlen(buf);}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -