/* invert10_31.c */
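/* This program experiments with building and updating an inverted index; this version adds handling of English index terms.
A $curfile.txt file stored in the index directory records the index file that is currently being written.
2006_10_4: Re-audited memory usage, mainly to fix possible memory leaks in the inverted index; also removed unrelated functions.
2006_10_8: Wrote the inverted-index update routine; idx.txt is written once after every several documents have been generated.
In $curfile.txt the first line records the name of the inverted file currently being written and the second line records the name of the file that has already been updated;
the next update starts from that file.
Based on the requirements above, the program was reworked as follows:
1. Removed the writes to idx.txt while the inverted index is being built; idx.txt is generated only during an update.
2. Wrote a standalone update function that reorganizes the contents of every file in the current directory that is greater than the file recorded in $update.txt (excluding idx.txt and $curfile.txt);
the reorganization mainly groups identical words together.
idx.txt is not written while the inverted index is being built.
*/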
#include "stdio.h"
#include "seng.h"
#include "string.h"
#include "math.h"
#include "malloc.h"
#include "stdlib.h"
#include "assert.h"
#include "direct.h"
#define MALCSIZE 100 /*一次分配的内存大小*/
#define RELCSIZE 100 /*当一次分配的内存不够时,二次分配时的加数*/
#define SHORTSIZE 20 /*一个词的最大词长10*/
#define INDEXNUMBER 6768 /*简体中文字的个数*/
#define GBLWBTMNUM 161 /*简体中文国标码低位最小值*/
#define GBLWTOPNUM 254 /*简体中文国标码低位最大值*/
#define GBHTBTMNUM 176 /*简体中文国标码高位最小值*/
#define GBHTTOPNUM 247 /*简体中文国标码高位最大值*/
#define MAXPATHL 50 /*最大路径*/
#define MAXPATH 50 /*最大路径*/
#define MAXWORD 80 /*最长的词长为40,一篇文档中最多出现的相同的字头的词的个数*/
#define MAXWORDONE 50 /*以某一个字开头的可能有的词数*/
#define MAXLINEFILE 3 /*倒排索引文件的最大行数*/
/* #define MAXWORDLEN 50 最大词长*/
#define MAXNUMBER 50 /*最大文档数*/
#define MAXFILENAME 20 /*最大文档数*/
#define MAXPOS 400 /*一个词在一篇文章中最多出现的次数*/
#define LOWERA 97 /*字母a所对应的的ASCII码*/
#define DIFLOWHIGA 32 /*大写字母和小写字母ASCII码的差值*/
#define MAXLINELEN 1000 /*倒排文档中每行最多出现的字符个数*/
#define MAXBUFFER 2000 /*最大缓存区, 要注意大小*/
int realloccount = 10;
/*将当前的词放入正向表中,sWords词内容, iPos词在原文件中的位置,正常返回0*/
int InstWd2FwList(ForwardNode *pfNode, char *sWords, char *sFileURL, int iPos, char type)
{
WordNode *pWordNode;
/*在链表中寻找该词,如果该词出现了就在原节点上将频率加1,如果该词没有出现则新创建并添加节点*/
if((pfNode == NULL) || (sWords == NULL) || (sFileURL == NULL) || (iPos < 0))
{
printf("error is: %d\n", EFWLIST);
return EFWLIST;
}
/*得到词链表的首节点*/
if (pfNode->wFWordNode == NULL)
{
pfNode->wFWordNode = (WordNode *) malloc(sizeof(WordNode));
pWordNode = pfNode->wFWordNode;
pWordNode->pnext = NULL;
pWordNode->fWeight = 0;
pWordNode->iFreq = 1;
pWordNode->iPos = (int *) malloc(MAXPOS * sizeof(int));
pWordNode->iPos[0] = iPos;
pWordNode->sResever = '0';
strcpy(pWordNode->sFileURL, sFileURL);
pWordNode->sWords = (char *) malloc( MAXWORDLEN * sizeof(char));
strcpy(pWordNode->sWords, sWords);
return 0;
}
pWordNode = pfNode->wFWordNode;
while ((strcmp(pWordNode->sWords, sWords) != 0) && (pWordNode->pnext != NULL))
{
pWordNode = pWordNode->pnext;
}
/*找到该词*/
if ((strcmp(pWordNode->sWords, sWords)) == 0)
{
/*词频加1*/
(pWordNode->iFreq)++;
/*记住该词的位置*/
if (pWordNode->iFreq < MAXPOS)
{
pWordNode->iPos[pWordNode->iFreq - 1] = iPos;
strcpy(pWordNode->sFileURL, sFileURL);
}
else
{
pWordNode->iPos = (int *) realloc(pWordNode->iPos, pWordNode->iFreq * sizeof(int));
strcpy(pWordNode->sFileURL, sFileURL);
if(pWordNode->iPos == NULL)
{
printf("error is: %d\n", EMALLOC);
printf("内存不足!\n");
return EMALLOC;
}
}
}
/*该链表中没有该词,创建该词节点*/
else
{
pWordNode->pnext = (WordNode *) malloc(sizeof(WordNode));
if(pWordNode->pnext == NULL)
{
printf("error is: %d\n", EMALLOC);
printf("内存不足!\n");
return EMALLOC;
}
pWordNode = pWordNode->pnext;
pWordNode->pnext = NULL;
pWordNode->fWeight = 0;
pWordNode->iFreq = 1;
pWordNode->sResever = type;
pWordNode->iPos = (int *) malloc(MAXPOS * sizeof(int));
if(pWordNode->iPos == NULL)
{
printf("error is: %d\n", EMALLOC);
printf("内存不足!\n");
return EMALLOC;
}
pWordNode->iPos[0] = iPos;
strcpy(pWordNode->sFileURL, sFileURL);
pWordNode->sWords = (char *) malloc( MAXWORDLEN * sizeof(char));
if(pWordNode->sWords == NULL)
{
printf("error is: %d\n", EMALLOC);
printf("内存不足!\n");
return EMALLOC;
}
strcpy(pWordNode->sWords, sWords);
}
return 0;
}
/* Assign a weight to every word in the forward list of one document.
   Words tagged as title ('t'), author ('a'), abstract ('b') or keyword ('k')
   get a fixed weight; every other word gets frequency / lPos.

   pfNode  forward-list entry whose word list is weighted
   lPos    total number of words counted in the document

   Returns 0 on success, EWEIGHT when pfNode is NULL. */
int Weight(ForwardNode *pfNode, long lPos)
{
    WordNode *pCur;
    double dTotal;

    if (pfNode == NULL)
    {
        printf("EWEIGHT\n");
        return EWEIGHT;
    }

    dTotal = lPos;
    for (pCur = pfNode->wFWordNode; pCur != NULL; pCur = pCur->pnext)
    {
        switch (pCur->sResever)
        {
        case 't': /* document title */
            pCur->fWeight = 0.7;
            break;
        case 'a': /* document author */
        case 'k': /* document keyword */
            pCur->fWeight = 1.0;
            break;
        case 'b': /* document abstract */
            pCur->fWeight = 0.5;
            break;
        default:  /* anything else: relative frequency */
            pCur->fWeight = (double) pCur->iFreq / dTotal;
            break;
        }
    }
    return 0;
}
/* Scan a buffer of segmented (whitespace-separated) words and build the
   forward list of one document: every Chinese or English word is inserted
   with its running position, then the weights are computed.

   SegBuf    segmented text; tokens are separated by whitespace
   sDocID    document id (currently a natural number as text), shorter than MAXDOCID
   sFileURL  location of the source file, shorter than MAXPATHL
   pfNode    forward-list entry to fill; its word list is reset to empty

   Section marker tokens ("title@title", "author@author", "keyword@keyword",
   "abstract@abstract", "text@text") switch the tag given to the following
   words and are not counted; the sentinel token "末##末" is skipped as well.
   Every other token advances the position counter, but only tokens that
   start with a GB2312 Chinese character or an ASCII letter are stored.
   A token that does not fit in MAXWORDLEN - 1 bytes is counted and skipped.

   Returns 0 on success, otherwise SEGFLRD (bad arguments or empty buffer),
   EDOCID, EPATHLEN, EFWWRT or EWEIGHT. */
int SegBufPos(char *SegBuf, char *sDocID, char *sFileURL, ForwardNode *pfNode)
{
    /* Same separator set that scanf's %s conversion uses in the "C" locale. */
    static const char sBlank[] = " \t\n\v\f\r";
    static const struct
    {
        const char *sMark;
        char cType;
    } aSection[] = {
        { "title@title", 't' },
        { "author@author", 'a' },
        { "keyword@keyword", 'k' },
        { "abstract@abstract", 'b' },
        { "text@text", 'x' }
    };
    const size_t nSections = sizeof(aSection) / sizeof(aSection[0]);
    char sWords[MAXWORDLEN];
    const char *pCur;
    const char *pToken;
    size_t nTokenLen;
    size_t iMark;
    long lPos;
    char type;
    unsigned char cHigh, cLow;
    int bChinese, bEnglish;

    if ((SegBuf == NULL) || (sDocID == NULL) || (sFileURL == NULL) || (pfNode == NULL))
    {
        printf("error is: %d\n", SEGFLRD);
        return SEGFLRD;
    }
    /* Store the document id. */
    if (MAXDOCID > strlen(sDocID))
    {
        strcpy(pfNode->sDocID, sDocID);
    }
    else
    {
        printf("error is: %d\n", EDOCID);
        printf("MAXDOCID is not enough!\n");
        return EDOCID;
    }
    /* Store the file location; the bound has to be checked on the string
       that is actually copied. */
    if (MAXPATHL > strlen(sFileURL))
    {
        strcpy(pfNode->sFileURL, sFileURL);
    }
    else
    {
        printf("error is: %d\n", EPATHLEN);
        printf("MAXPATHL is not enough!\n");
        return EPATHLEN;
    }
    printf("begin to SegBufPos\n");
    lPos = 0;
    pfNode->wFWordNode = NULL;
    if (SegBuf[0] == '\0')
    {
        printf("error is: %d\n", SEGFLRD);
        return SEGFLRD;
    }

    type = '0';
    pCur = SegBuf;
    for (;;)
    {
        /* Isolate the next token without copying more than sWords can hold. */
        pCur += strspn(pCur, sBlank);
        if (*pCur == '\0')
        {
            break;
        }
        pToken = pCur;
        nTokenLen = strcspn(pCur, sBlank);
        pCur += nTokenLen;

        if (nTokenLen >= sizeof(sWords))
        {
            /* Too long to be stored: it still occupies a position. */
            lPos++;
            continue;
        }
        memcpy(sWords, pToken, nTokenLen);
        sWords[nTokenLen] = '\0';

        if (strcmp(sWords, "末##末") == 0)
        {
            continue;
        }
        for (iMark = 0; iMark < nSections; iMark++)
        {
            if (strcmp(sWords, aSection[iMark].sMark) == 0)
            {
                break;
            }
        }
        if (iMark < nSections)
        {
            type = aSection[iMark].cType;
            continue;
        }

        /* Position of this word within the document. */
        lPos++;

        /* Classify by the first two bytes, read as unsigned so that GB2312
           lead bytes (>= 0xA1) compare correctly. */
        cHigh = (unsigned char) sWords[0];
        cLow = (unsigned char) sWords[1];
        bChinese = (cHigh >= GBHTBTMNUM) && (cHigh <= GBHTTOPNUM)
                && (cLow >= GBLWBTMNUM) && (cLow <= GBLWTOPNUM);
        bEnglish = ((cHigh >= 'a') && (cHigh <= 'z'))
                || ((cHigh >= 'A') && (cHigh <= 'Z'));

        if (bChinese || bEnglish)
        {
            /* Put the word into the forward list. */
            if (InstWd2FwList(pfNode, sWords, sFileURL, (int) lPos, type) != 0)
            {
                printf("error is: %d\n", EFWWRT);
                return EFWWRT;
            }
        }
    }
    printf("SegBufPos is over!\n");
    /* Compute the weight of every word. */
    if (Weight(pfNode, lPos) != 0)
    {
        return EWEIGHT;
    }
    printf("weight calculate is over!\n");
    return 0;
}
/*
功能:建立正向表,
记录处理的文档总数的文件名,即文档编号,
入口参数:
sFilePath 记录文档总个数文件所在的路径
sRcdFile 记录文档总个数的文件
sScFile 记录原文件位置的路径
fNode 正向表指针
DocCount 文档个数
sResult 分词后的文档缓冲区
返回值:成功返回0,否则返回错误编码
*/
int ForwardBld(char *sFilePath, char *sRcdFile, char *sScFile, ForwardNode **fNode, long DocCount, char *strBuf)
{
FILE *pFlCount;
char *sDocCount;
char *RcdFile;
char *sFileURL;
int iCurDoc;
long ltempCount;
int i;
if((sRcdFile == NULL) || (sScFile == NULL) || (sFilePath == NULL) || (fNode == NULL) || (strBuf == NULL))
{
printf("建立正向表时入口参处有误!\n");
return EFWBLDPAR;
}
if(DocCount <= 0)
{
printf("建立正向表时入口参处有误!\n");
return EFWBLDPAR;
}
RcdFile = (char *) malloc(MAXPATHLEN * sizeof(char ));
sDocCount = (char *) malloc(MAXNUMBER * sizeof(char));
sFileURL = (char *) malloc(MAXPATHLEN * sizeof(char));
if((RcdFile == NULL) || (sDocCount == NULL) || (sFileURL == NULL))
{
printf("内存不足!\n");
return EMEM;
}
/*得到记录系统总共文档数的文件名*/
strcpy(RcdFile, sFilePath);
strcat(RcdFile,"\\");
strcat(RcdFile, sRcdFile);
/*记录总共处理的文档个数*/
if( (pFlCount = fopen( RcdFile, "r" )) == NULL )
{
if((pFlCount = fopen(RcdFile, "a+")) == NULL)
{
printf("record.txt cann't open!\n");
/*return EFILEOPEN;*/
}
ltempCount = DocCount;
iCurDoc = 0;
itoa(ltempCount, sDocCount, 10);
if(pFlCount)
{
fprintf(pFlCount, "%s ", sDocCount);
fclose(pFlCount);
}
}
/*读出处理过的文档个数*/
else
{
fscanf(pFlCount,"%s ",sDocCount);
if(strlen(sDocCount) > MAXNUMBER)
{
/*文件指针退回*/
i = strlen(sDocCount);
i = -i;
fseek(pFlCount, i, 1);
sDocCount = (char *) realloc (sDocCount, (MAXNUMBER + RELCSIZE ) * sizeof(char));
fscanf(pFlCount,"%s ",sDocCount);
}
ltempCount = atol(sDocCount);
iCurDoc = ltempCount;
ltempCount = ltempCount + DocCount;
itoa(ltempCount, sDocCount, 10);
fclose(pFlCount);
if( (pFlCount = fopen( RcdFile, "w+" )) == NULL )
{
printf("record.txt can't open! (else) \n");
return EFILEOPEN;
}
fprintf(pFlCount, "%s ", sDocCount);
fclose(pFlCount);
}
printf("当前正在处理文档:%s\n", sDocCount);
*fNode = (ForwardNode *) malloc((DocCount) * sizeof(ForwardNode));
if(*fNode == NULL)
{
return EMEM;
}
/*得到分词前原文件名字*/
strcpy(sFileURL, sScFile);
for(i = 0; i < DocCount; i++)
{
/* 当前目录下的分词结果文件以数字命名
itoa(i,sFileName, 10);
strcat(sFileName,".txt");
*/
if(itoa(iCurDoc + i,sDocCount, 10) == NULL)
{
return EITOA;
}
if(SegBufPos(strBuf, sDocCount, sFileURL, *fNode + i) != 0)
{
printf("%d\n", ERSEGFILE);
printf("segbufPos return error!\n ");
return ERSEGFILE;
}
}
if(RcdFile)
free(RcdFile);
if(sDocCount)
free(sDocCount);
if(sFileURL)
free(sFileURL);
return 0;
}
/*该函数功能:建立汉字倒排索引表,(索引号是GB码减去176和161)
入口参数:InvertNode **Index为待分配的空间
int * error 错误编码,函数执行正确为0
返回值: InvertNode ** 分配内存后的地址
*/
InvertNode ** IndexBuild(InvertNode **Index, int * error)
{
int i;
int count = 0;
Index = (InvertNode **) malloc ((GBHTTOPNUM - GBHTBTMNUM + 1) * sizeof(InvertNode *));
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -