⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 geturltermfreqs.c

📁 C编写的用来实现search engine的推荐功能
💻 C
字号:
/**
 * getUrlTermFreqs.c
 * program to get terms and frequencies and merge them in one session
 * zhiyong, zhang, March, 01, 2006
 **/
#include <gdbm.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <math.h>
#include "getUrlTermFreqs.h"

static int compareTerm(const void* term1, const void* term2);
static int compareFreq(const void* freq1, const void* freq2);
static void parseTermFreq(const char *token, TERM_ITEM *item);

//static char termfreq_gdm_name[128] = "/home/zhiyong/webmining/raw/url_term_freq.gdbm";

/**
 * Parse one "term,freq" token into *item.
 *
 * The token is first copied (truncated if necessary) into a 100-byte scratch
 * buffer, then split at the first ','. Text before the comma becomes the
 * term; text after it is converted with atoi(). A token without a comma is
 * stored as a term with frequency 0 instead of dereferencing a NULL pointer.
 *
 * sizeof(item->term) is used as the copy bound, which relies on TERM_ITEM.term
 * being a char array (the rest of this file strcpy()s into it without ever
 * allocating it).
 **/
static void parseTermFreq(const char *token, TERM_ITEM *item)
{
	char termfreq[100];
	char *comma;

	snprintf(termfreq, sizeof(termfreq), "%s", token);

	comma = strchr(termfreq, ',');
	if (comma != NULL)
	{
		*comma = '\0';
		item->freq = atoi(comma + 1);
	}
	else
	{
		item->freq = 0;
	}

	snprintf(item->term, sizeof(item->term), "%s", termfreq);
}

/**
 * get terms from by url link
 *
 * Looks up `link` in the gdbm file `gdbmFname`. If the exact key is absent,
 * the lookup is retried once with the trailing '/' toggled (added when
 * missing, removed when present; that variant is limited to 254 characters).
 * The stored record is read as space separated "term,freq" tokens, cut at
 * the first newline, and written into terms->terms[] / terms->termNum.
 *
 * Parsing stops splitting on spaces once MAX_TERMS_PER_PAGE/2 tokens have
 * been stored; one more entry is then taken from whatever text remains.
 *
 * Returns the number of terms stored (> 0) on success,
 *         0 if the link is at most one character long, the key is not in
 *           the database, or the stored record is a single byte,
 *        -1 if the database cannot be opened or the fetch fails.
 **/
int getPageTerms(char *link, PAGE_TERMS *terms, char *gdbmFname)
{
	int num = 0;
	int ret;
	size_t len;
	size_t copyLen;
	GDBM_FILE db = NULL;
	datum name;
	datum value;
	char words[20480];
	char temp[256];
	char *ptr = NULL;
	char *ptr_cur = NULL;

	/* keep at most 254 characters so a '/' plus the terminator always fit */
	snprintf(temp, sizeof(temp) - 1, "%s", link);
	len = strlen(temp);
	if (len <= 1)
	{
		return 0;
	}

	//open the gdbm file for reading
	db = gdbm_open(gdbmFname, 2048, GDBM_READER, 0400, NULL);
	if (db == NULL)
	{
		fprintf(stderr, "cannot open the gdbm lib %s", gdbmFname);
		return -1;
	}

	name.dptr = link;
	name.dsize = (int)strlen(link);
	if (!gdbm_exists(db, name))
	{
		/* retry with the trailing slash toggled */
		if (temp[len - 1] != '/')
		{
			//if no, add
			temp[len] = '/';
			temp[len + 1] = '\0';
		}
		else
		{
			// if yes, remove
			temp[len - 1] = '\0';
		}
		name.dptr = temp;
		name.dsize = (int)strlen(temp);
		if (!gdbm_exists(db, name))
		{
			gdbm_close(db);
			return 0;
		}
	}

	value = gdbm_fetch(db, name);
	if (value.dptr == NULL || value.dsize <= 1)
	{
		fprintf(stderr, "gdbm fetch error\n");
		/* a one-byte record counts as "no terms", anything else as an error */
		ret = (value.dptr != NULL && value.dsize > 0) ? 0 : -1;
		free(value.dptr);
		gdbm_close(db);
		return ret;
	}

	/* copy the record into a local, NUL-terminated buffer, truncating
	 * records that are too large instead of writing past the end of words[] */
	copyLen = (size_t)value.dsize;
	if (copyLen >= sizeof(words))
	{
		copyLen = sizeof(words) - 1;
	}
	memcpy(words, value.dptr, copyLen);
	words[copyLen] = '\0';

	/* the fetched data is malloc'ed by gdbm and owned by the caller */
	free(value.dptr);
	value.dptr = NULL;
	gdbm_close(db);
	db = NULL;

	/* only the first line of the record is used */
	ptr = strchr(words, '\n');
	if (ptr != NULL)
	{
		*ptr = '\0';
	}

	//post processing
	ptr_cur = words;
	while ((ptr = strchr(ptr_cur, ' ')) != NULL)
	{
		*ptr = '\0';
		parseTermFreq(ptr_cur, &terms->terms[num]);
		num++;
		ptr_cur = ptr + 1;
		if (num >= MAX_TERMS_PER_PAGE/2)
		{
			break;
		}
	}

	/* last token, or the unsplit remainder when the loop stopped early */
	parseTermFreq(ptr_cur, &terms->terms[num]);
	num++;

	terms->termNum = num;
	return num;
}

/**
 * qsort comparator: ascending order by term string.
 **/
static int compareTerm(const void* term1, const void* term2)
{
	const TERM_ITEM *item1 = (const TERM_ITEM *)term1;
	const TERM_ITEM *item2 = (const TERM_ITEM *)term2;
	int cmp = strcmp(item1->term, item2->term);

	return (cmp > 0) - (cmp < 0);
}

/**
 * qsort comparator: descending order by frequency.
 **/
static int compareFreq(const void* freq1, const void* freq2)
{
	const TERM_ITEM *item1 = (const TERM_ITEM *)freq1;
	const TERM_ITEM *item2 = (const TERM_ITEM *)freq2;

	return (item1->freq < item2->freq) - (item1->freq > item2->freq);
}

/**
 * merge page terms and output result in term frequency order
 *
 * If terms1 is empty, terms2 is copied into it unchanged. Otherwise both
 * lists are sorted by term (terms2 is reordered in place as a side effect)
 * and merged into terms1:
 *   - a term only in terms2 keeps its frequency,
 *   - a term only in terms1 has its frequency divided by FORGETTING_FACTOR,
 *   - a term in both gets terms1 freq / FORGETTING_FACTOR + terms2 freq.
 * The merged list is capped at MAX_TERMS_PER_PAGE entries (the cut is made
 * in term order, before the final sort) and then sorted by descending
 * frequency.
 *
 * Returns the number of terms now held in terms1.
 **/
int mergePageTerms(PAGE_TERMS *terms1, PAGE_TERMS *terms2)
{
	int i = 0, j = 0;
	int k = 0;
	int cmp;
	PAGE_TERMS terms_merge;

	if (terms1->termNum == 0)
	{
		*terms1 = *terms2;
		return terms1->termNum;
	}

	memset(&terms_merge, 0, sizeof(terms_merge));
	qsort(terms1->terms, terms1->termNum, sizeof(TERM_ITEM), compareTerm);
	qsort(terms2->terms, terms2->termNum, sizeof(TERM_ITEM), compareTerm);

	/* One bounded loop covers both the overlapping part and the leftovers of
	 * either list, so the output can never exceed MAX_TERMS_PER_PAGE. */
	while (k < MAX_TERMS_PER_PAGE && (i < terms1->termNum || j < terms2->termNum))
	{
		if (i >= terms1->termNum)
		{
			cmp = -1;	/* terms1 exhausted: take from terms2 */
		}
		else if (j >= terms2->termNum)
		{
			cmp = 1;	/* terms2 exhausted: take from terms1 */
		}
		else
		{
			cmp = strcmp(terms2->terms[j].term, terms1->terms[i].term);
		}

		if (cmp < 0)
		{
			//only occurs in terms2
			strcpy(terms_merge.terms[k].term, terms2->terms[j].term);
			terms_merge.terms[k].freq = terms2->terms[j].freq;
			j++;
		}
		else if (cmp > 0)
		{
			//only occurs in terms1
			strcpy(terms_merge.terms[k].term, terms1->terms[i].term);
			terms_merge.terms[k].freq = terms1->terms[i].freq/FORGETTING_FACTOR;
			i++;
		}
		else
		{
			//same term occurs in both terms1 and terms2
			strcpy(terms_merge.terms[k].term, terms1->terms[i].term);
			terms_merge.terms[k].freq = terms1->terms[i].freq/FORGETTING_FACTOR + terms2->terms[j].freq;
			i++;
			j++;
		}
		k++;
	}
	terms_merge.termNum = k;

	qsort(terms_merge.terms, terms_merge.termNum, sizeof(TERM_ITEM), compareFreq);
	*terms1 = terms_merge;
	return terms_merge.termNum;
}

/**
 * get the page similarity
 *
 * Loads the term lists of link1 and link2 with getPageTerms() and computes
 * dotProduct / (sqrt(sum of squares 1) * sqrt(sum of squares 2)) over their
 * term frequencies, matching terms by exact string comparison.
 *
 * Returns the similarity value,
 *         0.0 if either page has only zero frequencies (avoids a division
 *             by zero),
 *        -1   if getPageTerms() reports an error for either link,
 *        -2   if either link yields no terms.
 **/
double getPageSimlarity(char *link1, char *link2, char *gdbmFname)
{
	int i = 0, j = 0;
	int cmp;
	int num1, num2;
	PAGE_TERMS terms1, terms2;
	double term1SumSqu = 0.0;
	double term2SumSqu = 0.0;
	double dotProduct = 0.0;
	double f1, f2;

	memset(&terms1, 0, sizeof(terms1));
	memset(&terms2, 0, sizeof(terms2));

	num1 = getPageTerms(link1, &terms1, gdbmFname);
	if (num1 < 0)
	{
		fprintf(stderr, "getPageTerms error %s\n", link1);
		return -1;
	}
	if (num1 == 0)
	{
		return -2;
	}

	num2 = getPageTerms(link2, &terms2, gdbmFname);
	if (num2 < 0)
	{
		fprintf(stderr, "getPageTerms error %s\n", link2);
		return -1;
	}
	if (num2 == 0)
	{
		return -2;
	}

	/* sort both lists by term so they can be walked in lock step */
	qsort(terms1.terms, terms1.termNum, sizeof(TERM_ITEM), compareTerm);
	qsort(terms2.terms, terms2.termNum, sizeof(TERM_ITEM), compareTerm);

	while (i < terms1.termNum || j < terms2.termNum)
	{
		if (i >= terms1.termNum)
		{
			cmp = -1;	/* terms1 exhausted */
		}
		else if (j >= terms2.termNum)
		{
			cmp = 1;	/* terms2 exhausted */
		}
		else
		{
			cmp = strcmp(terms2.terms[j].term, terms1.terms[i].term);
		}

		if (cmp < 0)
		{
			//only occurs in terms2
			f2 = (double)terms2.terms[j].freq;
			term2SumSqu += f2 * f2;
			j++;
		}
		else if (cmp > 0)
		{
			//only occurs in terms1
			f1 = (double)terms1.terms[i].freq;
			term1SumSqu += f1 * f1;
			i++;
		}
		else
		{
			//same term occurs in both terms1 and terms2
			f1 = (double)terms1.terms[i].freq;
			f2 = (double)terms2.terms[j].freq;
			dotProduct += f1 * f2;
			term1SumSqu += f1 * f1;
			term2SumSqu += f2 * f2;
			i++;
			j++;
		}
	}

	/* a zero-length vector has no direction; report no similarity
	 * rather than dividing by zero */
	if (term1SumSqu <= 0.0 || term2SumSqu <= 0.0)
	{
		return 0.0;
	}

	return dotProduct/(sqrt(term1SumSqu) * sqrt(term2SumSqu));
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -