📄 content_rec.c
字号:
/**
 * content_rec.c, content based recommendation implementation
 * zhiyong zhang
 * louisville May 23rd
 **/
#include <stdio.h>
#include <string.h>
#include <strings.h>
#include <math.h>

#include "c_http.h"
#include "NonbSocket2.h"
#include "recdef.h"
#include "getUrlTermFreqs.h"

/* Upper bound on the number of terms placed in one query term vector. */
#define MAX_TERMS_PER_VECTOR 20

/* Arguments passed unchanged to every HTTPRequest() call in this file. */
static char* extraHeaders = NULL;
static REQUEST_METHOD_T method = M_GET;
static int timeout = 100;

/* Flags read by processResponseBody() when a chunk of the body arrives. */
static int show1stLine = 0;
static int showAllLine = 0;

static int processResponseBody(int hSocket, char* buf, int size);

/* Base URL of the search front end, and the path + query prefix appended to it. */
static char search_root[256] = "http://webmining.spd.louisville.edu:8081/";
//static char search_root[256] = "http://136.165.45.51:8080/";
static char search_params[50] = "search.jsp?query=";

/* Database paths handed to getPageTerms(): page terms and anchor-text terms. */
static char terms_gdbm[256] = "/home/zhiyong/software/apache2/cgi-bin/terms.gdbm";
static char anchors_gdbm[256] = "/home/zhiyong/software/apache2/cgi-bin/anchors.gdbm";

/* Markers located in the returned HTML to cut out the result section. */
static char form_bflag[20] = "<form";
static char form_eflag[20] = "</form>";
static char logo_flag[50] = "<a href=\"http://www.nutch.org/\">";
static int processResponseBody(int hSocket, char* buf, int size){ int readByte = 0; while (1) { int ret = Receive(hSocket, buf, size, 1); switch (ret) { case SOCKET_ERROR: case SOCKET_CLOSED: return readByte; case SOCKET_TIMEOUT: break; default: if (show1stLine) { buf[ret] = '\0'; //writeLog(buf); //fwrite(buf, 1, ret, stdout); show1stLine = 0; } else if (showAllLine) { //fwrite(buf, 1, ret, stdout); buf[ret] = '\0'; //writeLog(buf); } readByte += ret; break; } }}
/** * get content-based recommendation results. **/int getContentBasedRecs(SESSION_LINKS links, char* buf){ int i,iRet; char orig_url[256]; char cur_url[256]; char pre_url[256]; PAGE_TERMS terms1, terms2; char term_freq[100]; char term_vector[20480]; HTTP_RESPONSE_HEADER_T headers; char request_str[1024]; char search_query[1024]; int len; int linkNum; char content[20481]; char temp[20481]; char *ptr = NULL; char *pptr = NULL; bzero(&terms1, sizeof(terms1)); bzero(&terms2, sizeof(terms2)); bzero(term_vector, sizeof(term_vector)); bzero(cur_url, sizeof(cur_url)); bzero(pre_url, sizeof(pre_url)); strcpy(orig_url, links.urls[0].url); iRet = getPageTerms(orig_url, &terms1, terms_gdbm); if(iRet < 0) { fprintf(stderr, "failed to get page terms from %s\n", orig_url); return -1; } linkNum = links.num; for(i=1;i<linkNum;i++) { if(!strcmp(links.urls[i].url, links.urls[i-1].url)) { //skip refreshing visits continue; } //sprintf(orig_url, "%s%s", orig_root, sessionLinks[i]); strcpy(orig_url, links.urls[i].url); iRet = getPageTerms(orig_url, &terms2, terms_gdbm); if(iRet < 0) { fprintf(stderr, "failed to get page terms from %s\n", orig_url); return -1; } iRet = mergePageTerms(&terms1, &terms2); if(iRet < 0) { fprintf(stderr, "merger page terms error \n"); return -1; } } sprintf(term_freq, "%s_%d", terms1.terms[0].term, terms1.terms[0].freq); strcpy(term_vector, term_freq); for(i=1;i<terms1.termNum && i<MAX_TERMS_PER_VECTOR;i++) { strcat(term_vector, "_"); sprintf(term_freq, "%s_%d", terms1.terms[i].term, terms1.terms[i].freq); strcat(term_vector, term_freq); //if(i >= MAX_TERMS_PER_VECTOR || terms1.terms[i].freq <=1) break; } //printf("term vector = %s <br>", term_vector); initResponseHeader(&headers); headers.processResponseBody = processResponseBody; if(linkNum>=2) { sprintf(cur_url, "%s", links.urls[linkNum-1].url); sprintf(pre_url, "%s", links.urls[linkNum-2].url); if(terms1.termNum == 0) {//to improve speed, don't use boolean minus operation.#if MINUS_CURRENT 
sprintf(search_query, "pageurl:%s+-url:%s+-url:%s", cur_url, cur_url, pre_url);#else sprintf(search_query, "pageurl:%s", cur_url);#endif } else {#if MINUS_CURRENT sprintf(search_query, "termvec:%s+-url:%s+-url:%s", term_vector, cur_url, pre_url);#else sprintf(search_query, "termvec:%s", term_vector);#endif } } else if(linkNum>=1) { sprintf(cur_url, "%s", links.urls[linkNum-1].url); if(terms1.termNum == 0) {#if MINUS_CURRENT sprintf(search_query, "pageurl:%s+-url:%s", cur_url, cur_url);#else sprintf(search_query, "pageurl:%s", cur_url);#endif } else {#if MINUS_CURRENT sprintf(search_query, "termvec:%s+-url:%s", term_vector, cur_url);#else sprintf(search_query, "termvec:%s", term_vector);#endif } } sprintf(request_str, "%s%s%s", search_root, search_params, search_query); len = HTTPRequest(request_str, method, extraHeaders, &headers, timeout, content, 20481); if(len <= 0) return -1; strcpy(temp, content); //post processing content ptr = strstr(temp, form_bflag); if(ptr == NULL) { fprintf(stderr, "Strange, can't find search begin button.\n"); goto EXIT;; } pptr = ptr + strlen(form_bflag); ptr = strstr(pptr, form_eflag); if(ptr == NULL) { fprintf(stderr, "Strange, can't find search end button.\n"); goto EXIT;; } pptr = ptr + strlen(form_eflag); ptr = strstr(pptr, form_bflag); if(ptr != NULL) *ptr = '\0'; else { ptr = strstr(pptr, logo_flag); if(ptr == NULL) { fprintf(stderr, "Strange, can't find search logo.\n"); goto EXIT; } *ptr = '\0'; } strcpy(buf, pptr); return strlen(buf);EXIT: strncpy(buf, content, 10240); buf[10239] = '\0'; return strlen(buf);}/** * get anchor text based recommendation results. 
**/int getAnchorBasedRecs(SESSION_LINKS links, char* buf){ int i,iRet; char orig_url[256]; char cur_url[256]; char pre_url[256]; PAGE_TERMS terms1, terms2; char term_freq[100]; char term_vector[20480]; HTTP_RESPONSE_HEADER_T headers; char request_str[1024]; char search_query[1024]; int len; int linkNum; char content[20481]; char temp[20481]; char *ptr = NULL; char *pptr = NULL; bzero(&terms1, sizeof(terms1)); bzero(&terms2, sizeof(terms2)); bzero(term_vector, sizeof(term_vector)); bzero(cur_url, sizeof(cur_url)); bzero(pre_url, sizeof(pre_url)); strcpy(orig_url, links.urls[0].url); iRet = getPageTerms(orig_url, &terms1, anchors_gdbm); if(iRet < 0) { fprintf(stderr, "failed to get anchor terms from %s\n", orig_url); return -1; } linkNum = links.num; for(i=1;i<linkNum;i++) { if(!strcmp(links.urls[i].url, links.urls[i-1].url)) { //skip refreshing visits continue; } strcpy(orig_url, links.urls[i].url); iRet = getPageTerms(orig_url, &terms2, anchors_gdbm); if(iRet < 0) { fprintf(stderr, "failed to get anchor terms from %s\n", orig_url); return -1; } iRet = mergePageTerms(&terms1, &terms2); if(iRet < 0) { fprintf(stderr, "merger anchor terms error \n"); return -1; } } if(terms1.termNum == 0) { return 0; } //sprintf(term_freq, "%s_%d", terms1.terms[0].term, terms1.terms[0].freq); sprintf(term_freq, "%s", terms1.terms[0].term); strcpy(term_vector, term_freq); for(i=1;i<terms1.termNum && i<MAX_TERMS_PER_VECTOR && i<4;i++) { strcat(term_vector, "+"); //sprintf(term_freq, "%s_%d", terms1.terms[i].term, terms1.terms[i].freq); sprintf(term_freq, "%s", terms1.terms[i].term); strcat(term_vector, term_freq); } initResponseHeader(&headers); headers.processResponseBody = processResponseBody; if(linkNum>=2) { //sprintf(cur_url, "%s%s", orig_root, sessionLinks[linkNum-1]); sprintf(cur_url, "%s", links.urls[linkNum-1].url); sprintf(pre_url, "%s", links.urls[linkNum-2].url);#if MINUS_CURRENT //sprintf(search_query, "termvec:%s+-url:%s+-url:%s", term_vector, cur_url, pre_url); 
sprintf(search_query, "%s+-url:%s+-url:%s", term_vector, cur_url, pre_url);#else //sprintf(search_query, "termvec:%s", term_vector); sprintf(search_query, "%s", term_vector);#endif } else if(linkNum>=1) { sprintf(cur_url, "%s", links.urls[linkNum-1].url);#if MINUS_CURRENT //sprintf(search_query, "termvec:%s+-url:%s", term_vector, cur_url); sprintf(search_query, "%s+-url:%s", term_vector, cur_url);#else //sprintf(search_query, "termvec:%s", term_vector); sprintf(search_query, "%s", term_vector);#endif } sprintf(request_str, "%s%s%s", search_root, search_params, search_query); len = HTTPRequest(request_str, method, extraHeaders, &headers, timeout, content, 20481); if(len <= 0) return -1; strcpy(temp, content); //post processing content ptr = strstr(temp, form_bflag); if(ptr == NULL) { fprintf(stderr, "Strange, can't find search begin button.\n"); goto EXIT;; } pptr = ptr + strlen(form_bflag); ptr = strstr(pptr, form_eflag); if(ptr == NULL) { fprintf(stderr, "Strange, can't find search end button.\n"); goto EXIT;; } pptr = ptr + strlen(form_eflag); ptr = strstr(pptr, form_bflag); if(ptr != NULL) *ptr = '\0'; else { ptr = strstr(pptr, logo_flag); if(ptr == NULL) { fprintf(stderr, "Strange, can't find search logo.\n"); goto EXIT; } *ptr = '\0'; } strcpy(buf, pptr); return strlen(buf);EXIT: strncpy(buf, content, 10240); buf[10239] = '\0'; return strlen(buf);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -