📄 webcom.cpp

📁 使用中科院ICTLAS和BM25算法的检索
💻 CPP
📖 第 1 页 / 共 2 页
字号:
12 下一页
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>
#include <math.h>
#include <malloc.h>
#include "webin.h"
#include "webcom.h"
#include "ICTCLAS30.h"

void getInfo(char buffer[], News * info)
{
	char *ptr = buffer, *ptrt;
	Reply *p = NULL;
	int i = 0;
	
	ptr = buffer;
	while((ptr = strstr(ptr, "<DIV class=title05>") )!= NULL)
	{
        if (p == NULL)
        {
            p = (Reply*) calloc (1, sizeof(Reply));
            p->pre = NULL;
	        info->rep = p;
        }
        else
        {
		    p->next = (Reply*) calloc (1, sizeof(Reply));
		    p->next->pre = p;
		    p = p->next;
        }
		p->parentID = 0;
		i++;
		p->ID = i;
		ptr = ptr + 19;
 		sscanf(ptr, "<H2><SPAN>%[^<]</SPAN>%*[^IP:]IP:%[^<]", p->postTime, p->IP);
		ptr = strstr(ptr, "<DIV class=cake09>") + 18;
		if ((ptrt = strstr(ptr, "<DIV class=cake06>")) == NULL)
			ptrt = strstr(ptr, "</DIV>");
		strncpy(p->comment, ptr, ptrt - ptr);
		ptr++;
	}
	p->next = NULL;
}

// The four search predicates below: IP only, keyword only, both, and either.
// Search predicate: matches when `IP` occurs as a substring of rep->IP.
// `keyWord` is not used; it is present so that all predicates share one signature.
// Returns 1 on a match, 0 otherwise.
int searchInfo1(Reply * rep, char IP[], char keyWord[])
{
	return (strstr(rep->IP, IP) != NULL) ? 1 : 0;
}
// Search predicate: matches when `keyWord` occurs as a substring of rep->comment.
// `IP` is not used; it is present so that all predicates share one signature.
// Returns 1 on a match, 0 otherwise.
int searchInfo2(Reply * rep, char IP[], char keyWord[])
{
	return (strstr(rep->comment, keyWord) != NULL) ? 1 : 0;
}
// Search predicate: matches only when `keyWord` occurs in rep->comment AND
// `IP` occurs in rep->IP (both as substrings).
// Returns 1 on a match, 0 otherwise.
int searchInfo3(Reply * rep, char IP[], char keyWord[])
{
	// The comment is tested first; the IP is only examined if it matched.
	if (strstr(rep->comment, keyWord) == NULL)
		return 0;
	if (strstr(rep->IP, IP) == NULL)
		return 0;
	return 1;
}
// Search predicate: matches when `keyWord` occurs in rep->comment OR
// `IP` occurs in rep->IP (both as substrings).
// Returns 1 on a match, 0 otherwise.
int searchInfo4(Reply * rep, char IP[], char keyWord[])
{
	// The comment is tested first; the IP is only examined if it did not match.
	if (strstr(rep->comment, keyWord) != NULL)
		return 1;
	if (strstr(rep->IP, IP) != NULL)
		return 1;
	return 0;
}

// Unlink the reply *rep from its doubly linked list and advance *rep to the
// node that followed it (NULL if it was the last one).
//
// Replies that answer the removed one are not touched, so the list structure is
// preserved. The unlinked node is not freed here.
//
// rep: address of a pointer to the node to remove. A NULL `rep` or a NULL *rep
//      is ignored.
void deleteReply(Reply **rep)
{
	if (rep == NULL || *rep == NULL)
		return;

	// Snapshot the node and its neighbours before relinking: `rep` may point at
	// a link field inside the list (e.g. &prev->next), which the relinking
	// below overwrites.
	Reply *victim = *rep;
	Reply *before = victim->pre;
	Reply *after  = victim->next;

	if (before != NULL)
		before->next = after;
	if (after != NULL)
		after->pre = before;

	*rep = after;
}

// Print a single reply to stdout: its ID, the ID it answers (only when
// parentID is non-zero), then its IP, post time and comment text.
void output(Reply *rep) 
{
	printf("ID：%i\n", rep->ID);

	// A parentID of 0 marks a reply that does not answer another reply.
	if (rep->parentID != 0)
	{
		printf("对ID：%i的回复\n", rep->parentID);
	}

	printf("IP：%s\n回复时间：%s\n正文：%s\n\n", rep->IP, rep->postTime, rep->comment);
}

// Print every reply in the list, starting at `rep` and following the `next`
// links until NULL. Nothing is printed for an empty list.
void printall(Reply *rep)
{
	for (Reply *cur = rep; cur != NULL; cur = cur->next)
		output(cur);
}

// Remove from the list every reply for which the predicate `Rsearch` returns
// non-zero, printing each removed reply first.
//
// rep:     address of the list head; updated when the first node is removed.
// Rsearch: predicate called as Rsearch(node, IP, keyWord).
// IP, keyWord: passed through unchanged to the predicate.
//
// Removed nodes are unlinked through deleteReply; they are not freed here.
void deleteHX(Reply **rep,int (*Rsearch)(Reply *, char [], char[]),char IP[], char keyWord[])
{
	Reply *cur = *rep;

	while (cur != NULL)
	{
		if (!Rsearch(cur, IP, keyWord))
		{
			cur = cur->next;
			continue;
		}

		printf("已删除：\n");
		output(cur);

		// Removing the head: the list now starts at its successor (or is empty).
		if (cur->pre == NULL)
			*rep = cur->next;

		// deleteReply also advances `cur` to the node after the removed one.
		deleteReply(&cur);
	}

	printf("已完成\n\n");
}

// Append a new reply that answers the reply whose ID is `parentID`.
//
// The new node is added at the tail of the list. Its ID is the tail's ID + 1,
// its postTime is the current local time as formatted by ctime() without the
// trailing newline, and its comment is a copy of `buffer`. Its IP is left empty.
//
// rep:      address of the list head.
// parentID: ID of the reply being answered.
// buffer:   NUL-terminated comment text.
//
// Returns 1 on success. Returns 0 when the list is empty, when no reply has
// the ID `parentID`, or when memory cannot be allocated.
int replypost(Reply **rep, int parentID, char buffer[])
{
	Reply *tail, *node;
	int found = 0;

	if (rep == NULL || *rep == NULL)
		return 0;

	// One pass: look for the parent and stop on the last node.
	for (tail = *rep; ; tail = tail->next)
	{
		if (tail->ID == parentID)
			found = 1;
		if (tail->next == NULL)
			break;
	}
	if (!found)
		return 0;

	node = (Reply*) calloc (1, sizeof(Reply));
	if (node == NULL)
		return 0;
	node->pre = tail;
	node->parentID = parentID;
	node->ID = tail->ID + 1;

	// The fields are zero-filled by calloc, so copying at most size-1 bytes
	// always leaves them NUL-terminated.
	time_t lt = time(NULL);
	const char *stamp = ctime(&lt);
	if (stamp != NULL)
	{
		strncpy(node->postTime, stamp, sizeof(node->postTime) - 1);
		size_t n = strlen(node->postTime);
		if (n > 0 && node->postTime[n - 1] == '\n')
			node->postTime[n - 1] = '\0';	// drop ctime's trailing newline
	}
	strncpy(node->comment, buffer, sizeof(node->comment) - 1);

	tail->next = node;
	return 1;
}

// Insert `rep` with score `Score` into the singly linked result list *index,
// which is kept ordered from highest to lowest dScore.
//
// index: address of the list head; updated when the new entry becomes the head.
// rep:   reply referenced by the new entry (the pointer is stored, not copied).
// Score: ranking score of the new entry.
//
// A new entry whose score equals existing ones is placed after them.
void insertIdx(Srch **index, Reply *rep, double Score)
{
	Srch *head = *index;

	// Exactly one zero-filled entry is created per call.
	Srch *fresh = (Srch*) calloc(1, sizeof(Srch));
	fresh->rep = rep;
	fresh->dScore = Score;

	// Empty list, or strictly better than the current best: new head.
	if (head == NULL || Score > head->dScore)
	{
		fresh->next = head;
		*index = fresh;
		return ;
	}

	// Walk to the entry after which the new one belongs: either the last
	// entry, or the first one followed by a strictly lower score.
	Srch *at = head;
	while (at->next != NULL)
	{
		if (at->dScore >= Score && at->next->dScore < Score)
			break;
		at = at->next;
	}

	fresh->next = at->next;	// NULL when appending at the tail
	at->next = fresh;
	return ;
}

// Count, with strstr, how many times each keyword of the list occurs in
// `content`, adding the count to that node's oTimes.
//
// content: NUL-terminated text to search.
// pNode:   head of the keyword list; may be NULL.
//
// Occurrences are counted at every start position, so overlapping matches are
// all counted. Nodes with an empty key are skipped.
// Returns the head of the list that was passed in.
Link* countkey1(char *content, Link *pNode)
{
	Link *cur;

	for (cur = pNode; cur != NULL; cur = cur->next)
	{
		const char *hit = content;

		// strstr matches an empty needle at every position, including the
		// terminator; stepping past that would read beyond `content`.
		if (cur->key[0] == '\0')
			continue;

		while ((hit = strstr(hit, cur->key)) != NULL)
		{
			cur->oTimes++;
			hit++;	// restart one byte further to find the next occurrence
		}
	}
	return pNode;
}

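// The routines below count how many times each keyword occurs in a reply; written by Liu He.
/**********************************************************************************************************
Counting keyword occurrences with a plain strstr call is easy to implement but inefficient. This is a
low-level routine that is called very often, so to speed up the whole system a different algorithm is used:
Layout of the hash table:
hash[x][0]  number of occurrences of the character whose code is x.
hash[x][i]  position of the i-th occurrence of the character whose code is x.
With it, the positions where a keyword may start can be located quickly (only the first character is
matched, i.e. only the first character of a keyword has a hash entry), so the number of occurrences of a
keyword can be computed quickly; it works somewhat like a hash table.
**********************************************************************************************************/
// P.S.: in practice, strstr turned out to perform better on posts like these...
//       but the method is unusual, so it has been kept.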

// Compute the integer code of one two-byte character from its high and low
// bytes. Each byte is reduced to its unsigned 8-bit value, then combined as
//   (high - FIRSTCTOP) * RANGE + low - LASTCTOP.
// NOTE(review): FIRSTCTOP, RANGE and LASTCTOP are defined outside this file
// section; the result can be negative for bytes below those offsets.
int enCode(int highByte, int lowByte)
{
	return ((unsigned char)highByte - FIRSTCTOP) * RANGE + (unsigned char)lowByte - LASTCTOP;
}

// Build the position map for the first characters of the keywords.
//
// `content` is read two bytes at a time; each pair is turned into a code with
// enCode. When mark[code] is non-zero, the pair's byte offset is appended to
// hash[code]: hash[code][0] holds the number of offsets stored and
// hash[code][1..] the offsets themselves.
//
// content: NUL-terminated text made of two-byte characters.
// hash:    per-code arrays; a full array is grown with realloc.
// tempLen: tempLen[x] is the current capacity recorded for hash[x]; it is
//          doubled every time hash[x] is grown.
// mark:    mark[x] is non-zero when the character with code x is the first
//          character of a keyword, 0 otherwise.
//
// A dangling single byte at the end of `content` is ignored. If an array
// cannot be grown, scanning stops and what was recorded so far is kept.
// NOTE(review): the code is not checked against the size of mark/hash here;
// the caller must size those arrays for every code enCode can return.
void lh_Hash(char *content,int *hash[], int tempLen[], int mark[])
{
	int i = 0;

	while (content[i] != '\0')
	{
		int start = i;	// byte offset of this character
		int highByte, lowByte, hashvalue;

		// Never read the second byte past the terminator.
		if (content[i + 1] == '\0')
			break;
		highByte = content[i++];
		lowByte = content[i++];

		hashvalue = enCode(highByte, lowByte);
		if (hashvalue < 0 || mark[hashvalue] == 0) continue;

		// Grow before storing when the new count would reach the capacity.
		if (hash[hashvalue][0] + 1 >= tempLen[hashvalue])
		{
			// _msize is the MSVC CRT call returning the size of a heap block.
			size_t preSize = _msize(hash[hashvalue]);
			int *grown = (int* )realloc(hash[hashvalue], preSize + tempLen[hashvalue]*sizeof(int));
			if (grown == NULL)
				return;	// the old block is still valid and unchanged
			hash[hashvalue] = grown;
			tempLen[hashvalue] *= 2;
		}

		hash[hashvalue][0]++;
		hash[hashvalue][hash[hashvalue][0]] = start;
	}
}
// Tell whether `content` begins with the whole of `keyWord`.
// Returns 1 when every byte of keyWord matches the byte at the same offset in
// content, 0 otherwise. An empty keyWord always matches.
int isCorreSpond (char keyWord[], char *content)
{
	size_t keyLen = strlen(keyWord);

	// A prefix comparison over exactly keyLen bytes; a shorter `content`
	// mismatches on its terminator.
	return (strncmp(content, keyWord, keyLen) == 0) ? 1 : 0;
}
// Count how many times a keyword occurs in `content`, using the position map
// built by lh_Hash.
//
// Leading single-byte (ASCII) characters of `keyWord` are skipped; matching
// starts at its first two-byte character. Every recorded position of that
// character is then checked with isCorreSpond.
//
// hash:    hash[x][0] is the number of recorded positions for code x,
//          hash[x][1..] the positions themselves.
// keyWord: NUL-terminated keyword.
// content: the text that was given to lh_Hash.
// mark:    mark[x] is 0 when code x was not registered as a first character.
//
// Returns the number of matches; 0 when the keyword holds no complete two-byte
// character or when its first character is not registered.
int lh_Count(int *hash[], char *keyWord, char *content, int mark[])
{
	int i, tot = 0, hashvalue, numsOfloop;

	// Skip ASCII bytes, but never walk past the terminator.
	while (*keyWord != '\0' && (unsigned char)*keyWord < 0x80) keyWord++;
	if (keyWord[0] == '\0' || keyWord[1] == '\0') return 0;

	hashvalue = enCode(keyWord[0], keyWord[1]);
	// A negative code must not be used as an index.
	if (hashvalue < 0 || mark[hashvalue] == 0) return 0;

	// One check per recorded occurrence of the first character.
	numsOfloop = hash[hashvalue][0];
	for (i = 1; i <= numsOfloop; i++)
		if (isCorreSpond(keyWord, &content[hash[hashvalue][i]])) tot++;
	return tot;
}
// Takes the text of a reply and the keyword linked list, and counts how many times each keyword occurs (updating oTimes in the list).
Link* countkey(char *content, Link *pNode)
{
	int *hash[NUM_OF_CHINESE];
	int tempLen[NUM_OF_CHINESE];
	int mark[NUM_OF_CHINESE];
	int index[NUM_OF_CHINESE];
	char buffer[20000], *keyWord;
	struct Link *temppNode = pNode;
	memset(tempLen, 0, sizeof(tempLen));
	memset(buffer, 0, sizeof(buffer));
	memset(mark, 0, sizeof(mark));
	memset(index, 0, sizeof(index));
	keyWord = (char *)calloc(14,sizeof(char));

	int i = 0, j = 0, hashvalue;

	while (content[i] != NULL)
	{
		if (content[i] < 0)
			buffer[j++] = content[i];
		i++;
	}//提取回帖中的所有中文（爪哇文被XX掉）
	
	buffer[j] = 0;
	while (temppNode != NULL)
	{
		strcpy(keyWord, temppNode->key);
		while (*keyWord >= 0) keyWord++;
12 下一页
💿 文件大小 2577 K
👤 上传用户 flyhack007
📂 所属分类多国语言处理
🏷️ 相关标签

#ICTLAS #BM #25 #算法
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -