📄 geturltermfreqs.c
字号:
/**
 * getUrlTermFreqs.c
 * program to get terms and frequencies and merge them in one session
 * zhiyong, zhang, March, 01, 2006
 **/
#include <gdbm.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <math.h>
#include "getUrlTermFreqs.h"

static int compareTerm(const void* term1, const void* term2);
static int compareFreq(const void* freq1, const void* freq2);

//static char termfreq_gdm_name[128] = "/home/zhiyong/webmining/raw/url_term_freq.gdbm";

/**
 * Parse one "term,freq" token and store it in terms->terms[idx].
 *
 * The term is truncated to fit the destination field. A token without a
 * comma is stored with a frequency of 0 instead of dereferencing a NULL
 * pointer.
 *
 * NOTE(review): sizeof(...term) assumes TERM_ITEM.term is an embedded char
 * array (this file zero-fills PAGE_TERMS and then copies strings straight
 * into the field) -- confirm against getUrlTermFreqs.h.
 **/
static void storeTermFreq(PAGE_TERMS *terms, int idx, const char *token)
{
    const char *comma = strchr(token, ',');
    size_t termLen = (comma != NULL) ? (size_t)(comma - token) : strlen(token);
    size_t cap = sizeof(terms->terms[idx].term);

    if (termLen >= cap)
        termLen = cap - 1;
    memcpy(terms->terms[idx].term, token, termLen);
    terms->terms[idx].term[termLen] = '\0';
    terms->terms[idx].freq = (comma != NULL) ? atoi(comma + 1) : 0;
}

/**
 * get terms from by url link
 *
 * Looks up `link` in the gdbm file `gdbmFname`. If the key is missing, the
 * lookup is retried with the trailing '/' toggled (added when absent,
 * removed when present) on a copy of the link limited to 254 characters.
 * The stored value is read up to its first newline and split on spaces into
 * "term,freq" tokens, which are written to terms->terms[]; terms->termNum is
 * set to the number of tokens stored.
 *
 * At most MAX_TERMS_PER_PAGE/2 + 1 entries are stored. Empty tokens (from
 * consecutive or trailing spaces) are skipped.
 *
 * @return number of terms stored;
 *         0 if the link is shorter than 2 characters, the key is not in the
 *           database, or the stored value is a single byte
 *           (terms->termNum is left untouched in these cases);
 *         -1 on a NULL argument, open failure or fetch failure.
 **/
int getPageTerms(char *link, PAGE_TERMS *terms, char *gdbmFname)
{
    int num = 0;
    int rc = -1;
    GDBM_FILE db = NULL;
    datum name;
    datum value;
    char words[20480];
    char temp[256];
    char *sep = NULL;
    char *cursor = NULL;
    size_t len;

    value.dptr = NULL;
    value.dsize = 0;

    if (link == NULL || terms == NULL || gdbmFname == NULL)
        return -1;

    /* Keep at most 254 characters so one '/' plus the terminator still fit. */
    snprintf(temp, sizeof(temp) - 1, "%s", link);
    len = strlen(temp);
    if (len <= 1)
        return 0;

    //open the gdbm file for reading
    db = gdbm_open(gdbmFname, 2048, GDBM_READER, 0400, NULL);
    if (db == NULL) {
        fprintf(stderr, "cannot open the gdbm lib %s", gdbmFname);
        return -1;
    }

    name.dptr = link;
    name.dsize = (int)strlen(link);
    if (!gdbm_exists(db, name)) {
        if (temp[len - 1] != '/') {
            //if no, add
            temp[len] = '/';
            temp[len + 1] = '\0';
        } else {
            // if yes, remove
            temp[len - 1] = '\0';
        }
        name.dptr = temp;
        name.dsize = (int)strlen(temp);
        if (!gdbm_exists(db, name)) {
            rc = 0;
            goto EXIT;
        }
    }

    value = gdbm_fetch(db, name);
    if (value.dptr == NULL || value.dsize <= 1) {
        fprintf(stderr, "gdbm fetch error\n");
        /* A one-byte value counts as "no terms"; anything else is an error. */
        rc = (value.dptr != NULL && value.dsize > 0) ? 0 : -1;
        goto EXIT;
    }

    /* The fetched value is not NUL-terminated; copy it with a hard bound. */
    len = (size_t)value.dsize;
    if (len >= sizeof(words))
        len = sizeof(words) - 1;
    memcpy(words, value.dptr, len);
    words[len] = '\0';

    /* Only the first line of the record is used. */
    sep = strchr(words, '\n');
    if (sep != NULL)
        *sep = '\0';

    //post processing
    cursor = words;
    while ((sep = strchr(cursor, ' ')) != NULL) {
        *sep = '\0';
        if (*cursor != '\0')
            storeTermFreq(terms, num++, cursor);
        cursor = sep + 1;
        if (num >= MAX_TERMS_PER_PAGE / 2)
            break;
    }

    /*
     * One more token follows the loop: the last token of the line, or, when
     * the cap was hit, the next token. Cut it at the following space so the
     * untokenised remainder is never treated as a single token.
     */
    sep = strchr(cursor, ' ');
    if (sep != NULL)
        *sep = '\0';
    if (*cursor != '\0')
        storeTermFreq(terms, num++, cursor);

    terms->termNum = num;
    rc = num;

EXIT:
    free(value.dptr);
    value.dptr = NULL;
    if (db != NULL) {
        gdbm_close(db);
        db = NULL;
    }
    return rc;
}

/**
 * qsort comparator: ascending strcmp() order of TERM_ITEM.term.
 * Returns -1, 0 or 1.
 **/
static int compareTerm(const void* term1, const void* term2)
{
    const TERM_ITEM *a = (const TERM_ITEM *)term1;
    const TERM_ITEM *b = (const TERM_ITEM *)term2;
    int cmp = strcmp(a->term, b->term);

    return (cmp > 0) - (cmp < 0);
}

/**
 * qsort comparator: descending order of TERM_ITEM.freq.
 * Returns -1 when the first item has the larger frequency.
 **/
static int compareFreq(const void* freq1, const void* freq2)
{
    const TERM_ITEM *a = (const TERM_ITEM *)freq1;
    const TERM_ITEM *b = (const TERM_ITEM *)freq2;

    return (a->freq < b->freq) - (a->freq > b->freq);
}

/**
 * merge page terms and output result in term frequency order
 *
 * Merges terms2 into terms1. Frequencies already in terms1 are divided by
 * FORGETTING_FACTOR; frequencies from terms2 are taken as-is; a term present
 * in both gets the sum of the two. The result replaces *terms1, sorted by
 * descending frequency. If terms1 is empty it simply becomes a copy of
 * terms2, sorted the same way.
 *
 * Side effect: the entries of both inputs are re-sorted by term.
 *
 * At most MAX_TERMS_PER_PAGE entries are produced; when the cap is reached
 * the terms that come last in strcmp() order are the ones dropped.
 *
 * @return number of terms in the merged result.
 **/
int mergePageTerms(PAGE_TERMS *terms1, PAGE_TERMS *terms2)
{
    int i = 0, j = 0;
    int k = 0;
    PAGE_TERMS terms_merge;

    if (terms1->termNum == 0) {
        *terms1 = *terms2;
        /* Keep the documented frequency ordering on this path too. */
        qsort(terms1->terms, terms1->termNum, sizeof(TERM_ITEM), compareFreq);
        return terms1->termNum;
    }

    bzero(&terms_merge, sizeof(terms_merge));

    qsort(terms1->terms, terms1->termNum, sizeof(TERM_ITEM), compareTerm);
    qsort(terms2->terms, terms2->termNum, sizeof(TERM_ITEM), compareTerm);

    /* Classic sorted-list merge, bounded by the capacity of the result. */
    while (k < MAX_TERMS_PER_PAGE && i < terms1->termNum && j < terms2->termNum) {
        int cmp = strcmp(terms2->terms[j].term, terms1->terms[i].term);

        if (cmp < 0) {
            //only occurs in terms2
            strcpy(terms_merge.terms[k].term, terms2->terms[j].term);
            terms_merge.terms[k].freq = terms2->terms[j].freq;
            j++;
        } else if (cmp > 0) {
            //only occurs in terms1
            strcpy(terms_merge.terms[k].term, terms1->terms[i].term);
            terms_merge.terms[k].freq = terms1->terms[i].freq/FORGETTING_FACTOR;
            i++;
        } else {
            //same term occurs in both terms1 and terms2
            strcpy(terms_merge.terms[k].term, terms1->terms[i].term);
            terms_merge.terms[k].freq = terms1->terms[i].freq/FORGETTING_FACTOR + terms2->terms[j].freq;
            i++;
            j++;
        }
        k++;
    }

    /*
     * Copy whatever is left of the list that was not exhausted. At most one
     * of these loops runs, and both stop at the capacity of the result so
     * the merge buffer cannot be overrun.
     */
    for (; k < MAX_TERMS_PER_PAGE && i < terms1->termNum; i++, k++) {
        strcpy(terms_merge.terms[k].term, terms1->terms[i].term);
        terms_merge.terms[k].freq = terms1->terms[i].freq/FORGETTING_FACTOR;
    }
    for (; k < MAX_TERMS_PER_PAGE && j < terms2->termNum; j++, k++) {
        strcpy(terms_merge.terms[k].term, terms2->terms[j].term);
        terms_merge.terms[k].freq = terms2->terms[j].freq;
    }
    terms_merge.termNum = k;

    qsort(terms_merge.terms, terms_merge.termNum, sizeof(TERM_ITEM), compareFreq);

    *terms1 = terms_merge;
    return terms_merge.termNum;
}

/**
 * get the page similarity
 *
 * Loads the term/frequency lists of link1 and link2 from `gdbmFname` and
 * returns their cosine similarity:
 *   dot(f1, f2) / (sqrt(sum f1^2) * sqrt(sum f2^2)).
 *
 * @return the similarity;
 *         0.0 when either page has no non-zero frequency (the ratio would
 *             otherwise be 0/0);
 *         -1 when getPageTerms() fails for either link;
 *         -2 when either link yields no terms.
 **/
double getPageSimlarity(char *link1, char *link2, char *gdbmFname)
{
    int i = 0, j = 0;
    int num1, num2;
    PAGE_TERMS terms1, terms2;
    double term1SumSqu = 0.0;
    double term2SumSqu = 0.0;
    double dotProduct = 0.0;
    double denom;

    bzero(&terms1, sizeof(terms1));
    bzero(&terms2, sizeof(terms2));

    num1 = getPageTerms(link1, &terms1, gdbmFname);
    if (num1 < 0) {
        fprintf(stderr, "getPageTerms error %s\n", link1);
        return -1;
    }
    if (num1 == 0) {
        return -2;
    }

    num2 = getPageTerms(link2, &terms2, gdbmFname);
    if (num2 < 0) {
        fprintf(stderr, "getPageTerms error %s\n", link2);
        return -1;
    }
    if (num2 == 0) {
        return -2;
    }

    qsort(terms1.terms, terms1.termNum, sizeof(TERM_ITEM), compareTerm);
    qsort(terms2.terms, terms2.termNum, sizeof(TERM_ITEM), compareTerm);

    /* Walk both term-sorted lists in lock step until both are exhausted. */
    while (i < terms1.termNum || j < terms2.termNum) {
        int cmp;

        if (i >= terms1.termNum)
            cmp = -1;   /* only terms2 has entries left */
        else if (j >= terms2.termNum)
            cmp = 1;    /* only terms1 has entries left */
        else
            cmp = strcmp(terms2.terms[j].term, terms1.terms[i].term);

        if (cmp < 0) {
            //only occurs in terms2
            term2SumSqu += (double)terms2.terms[j].freq * terms2.terms[j].freq;
            j++;
        } else if (cmp > 0) {
            //only occurs in terms1
            term1SumSqu += (double)terms1.terms[i].freq * terms1.terms[i].freq;
            i++;
        } else {
            //same term occurs in both terms1 and terms2
            dotProduct += (double)terms1.terms[i].freq * terms2.terms[j].freq;
            term1SumSqu += (double)terms1.terms[i].freq * terms1.terms[i].freq;
            term2SumSqu += (double)terms2.terms[j].freq * terms2.terms[j].freq;
            i++;
            j++;
        }
    }

    denom = sqrt(term1SumSqu) * sqrt(term2SumSqu);
    if (!(denom > 0.0))
        return 0.0;     /* a zero vector has no direction; avoid 0/0 */

    return dotProduct / denom;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -