📄 invert10_31.c
字号:
if (Index == NULL)
{
*error = EINDEXBUILD;
printf("error is: %d\n", EINDEXBUILD);
printf("内存不足!\n");
return NULL;
}
for(i = 0; i < (GBHTTOPNUM - GBHTBTMNUM + 1); i++)
{
Index[i] = (InvertNode *) malloc(( GBLWTOPNUM - GBLWBTMNUM + 1) * sizeof(InvertNode ));
if (Index[i] == NULL)
{
*error = EINDEXBUILD;
printf("error is: %d\n", EINDEXBUILD);
printf("内存不足!\n");
return NULL;
}
}
return Index;
}
/*
 * EIdxBuild - allocate the table used as the English inverted index.
 *
 * Allocates 26 InvertNode entries (presumably one per letter 'a'..'z';
 * InvertIdxBld indexes its English table by "first letter - LOWERA").
 * The entries are zero-filled because InvertIdxBld tests lWordNum == 0 to
 * detect an unused entry and follows pNextNode before ever writing it.
 *
 * Parameters:
 *   Index - ignored on entry; kept only for interface compatibility.
 *   error - set to EINDEXBUILD when the allocation fails; left untouched
 *           on success, so the caller should initialise it to 0.
 *
 * Returns the newly allocated table (owned by the caller), or NULL when
 * memory is exhausted.
 */
InvertNode * EIdxBuild(InvertNode *Index, int * error)
{
    Index = (InvertNode *) calloc(26, sizeof(InvertNode));
    if (Index == NULL)
    {
        if (error != NULL)
        {
            *error = EINDEXBUILD;
        }
        printf("error is: %d\n", EINDEXBUILD);
        return NULL;
    }
    return Index;
}
/*该函数功能:建立倒排索引表,(索引号是GB码减去176和161)
入口参数:InvertNode **InvertIndex 为中文国标码索引待分配的空间
InvertNode *EIntIdx 为英文单词索引
const int DocCount 为一次处理的文档个数
const ForwardNode *fNode 为正向表所指向的指针
返回值: 为0,操作正常,否则错误
*/
int InvertIdxBld(InvertNode **InvertIndex ,InvertNode *EIntIdx ,const ForwardNode *fNode, const int DocCount)
{
/*int *IndexError;*/
WordNode *pCurWordNode;
DocNode *pCurDoc;
int flag;
int i,k,m;
unsigned char uTemp[MAXWORD + 1];
InvertNode *pInvertNode;
/*IndexError = (int *) malloc (sizeof(int));
*IndexError = 0;*/
if (fNode == NULL){
return EFNODE;
}
if (DocCount < 0) {
return EFNODE;
}
/*建立倒排索引*/
for(i = 0 ; i < DocCount; i++)
{
pCurWordNode = (fNode + i)->wFWordNode;
printf("\n正在建立文档: %s 的倒排索引\n", (fNode + i)->sFileURL );/**/
while(pCurWordNode != NULL)
{
sprintf(uTemp, pCurWordNode->sWords);
/*如果是汉字,将汉字写入汉字倒排索引*/
if((uTemp[0] <= GBHTTOPNUM) && (uTemp[0] >= GBHTBTMNUM) && (uTemp[1] <= GBLWTOPNUM) && (uTemp[1] >= GBLWBTMNUM))
{
/*如果是未出现的字*/
if(InvertIndex[uTemp[0] - GBHTBTMNUM][uTemp[1] - GBLWBTMNUM].lWordNum == 0)
{
pInvertNode = &InvertIndex[uTemp[0] - GBHTBTMNUM][uTemp[1] - GBLWBTMNUM];
pInvertNode->lWordNum = 1;
/*如果是单字,在单字的位置上,新添加文档节点*/
if (uTemp[2] == '\0')
{
pInvertNode->pDocNode = (DocNode *) malloc(sizeof(DocNode));
pCurDoc = pInvertNode->pDocNode;
strcpy(pInvertNode->sWords, uTemp);
pCurDoc->pNext = NULL;
pCurDoc->iPos = NULL;
}
/*如果不是单字就在后面连上一个InvertNode,并记录*/
else
{
pInvertNode->pNextNode = (InvertNode *) malloc (sizeof(InvertNode));
pInvertNode = pInvertNode->pNextNode;
strcpy(pInvertNode->sWords, uTemp);
pInvertNode->pNextNode = NULL;
pInvertNode->lDocNum = 0;
pInvertNode->pDocNode = (DocNode *) malloc (sizeof(DocNode));
pCurDoc = pInvertNode->pDocNode;
pCurDoc->pNext = NULL;
pCurDoc->iPos = NULL;
}
}
/*如果该字已出现过*/
else
{
pInvertNode = &InvertIndex[uTemp[0] - GBHTBTMNUM][uTemp[1] - GBLWBTMNUM];
/*寻找该词*/
flag = 0;
if (strcmp(uTemp,pInvertNode->sWords) == 0)
{
flag = 1;
}
else
{
while (pInvertNode->pNextNode != NULL)
{
/*如果该词出现过*/
if(strcmp(uTemp,pInvertNode->sWords) == 0)
{
flag = 1;
break;
}
pInvertNode = pInvertNode->pNextNode;
}
if (strcmp(uTemp,pInvertNode->sWords) == 0)
{
flag = 1;
}
}
/*如果该词是未出现的词*/
if(flag == 0)
{
(InvertIndex[uTemp[0] - GBHTBTMNUM][uTemp[1] - GBLWBTMNUM].lWordNum)++;
pInvertNode->pNextNode = (InvertNode *)malloc(sizeof(InvertNode));
pInvertNode = pInvertNode->pNextNode;
/*这句可要可不要,因为只在单字节点中记录以该字为头的词数,不在词节点中记录个数*/
pInvertNode->lWordNum = 1;
pInvertNode->lDocNum = 0;
pInvertNode->pNextNode = NULL;
strcpy(pInvertNode->sWords, uTemp);
/*分配文档节点的内存*/
pInvertNode->pDocNode = (DocNode *) malloc(sizeof(DocNode));
pCurDoc = pInvertNode->pDocNode;
pCurDoc->pNext = NULL;
}
/*如果以前出现过,则找到该词所在的pInvertNode,然后新添加文档节点*/
else
{
pCurDoc = pInvertNode->pDocNode;
/*找到链表的尾端*/
while (pCurDoc->pNext != NULL)
{
pCurDoc = pCurDoc->pNext;
}
pCurDoc ->pNext = (DocNode *) malloc (sizeof(DocNode));
pCurDoc = pCurDoc->pNext;
pCurDoc->pNext = NULL;
}
}
/*填入文档信息*/
pInvertNode->lDocNum ++;
strcpy(pCurDoc->sDocID,(fNode + i)->sDocID);
pCurDoc->fWeight = pCurWordNode->fWeight;
pCurDoc->iFreq = pCurWordNode->iFreq;
pCurDoc->iPos = (int *) malloc(pCurDoc->iFreq * sizeof(int));
for( k = 0; k < pCurWordNode->iFreq; k++)
{
pCurDoc->iPos[k] = pCurWordNode->iPos[k];
}
strcpy(pCurDoc->sFileURL, (fNode + i)->sFileURL);
}
/*如果正向表中是英文,则将其写入英文倒排索引*/
if((uTemp[0] <= 'z') && (uTemp[0] >= 'a') || (uTemp[0] <= 'Z') && (uTemp[0] >= 'A'))
{
/*将所有大写字母转换成小写字母*/
m = 0;
while(uTemp[m] != '\0')
{
if((uTemp[m] <= 'Z') && (uTemp[m] >= 'A'))
{
uTemp[m] = uTemp[m] + DIFLOWHIGA;
}
m++;
}
/*如果是未出现的字母*/
if(EIntIdx[uTemp[0] - LOWERA].lWordNum == 0)
{
pInvertNode = &EIntIdx[uTemp[0] - LOWERA];
pInvertNode->lWordNum = 1;
/*如果是单个字母,在单个字母的位置上,新添加文档节点*/
if (uTemp[1] == '\0')
{
pInvertNode->pDocNode = (DocNode *) malloc(sizeof(DocNode));
pCurDoc = pInvertNode->pDocNode;
strcpy(pInvertNode->sWords, uTemp);
pCurDoc->pNext = NULL;
pCurDoc->iPos = NULL;
}
/*如果不是单个字母就在后面连上一个InvertNode,并记录*/
else
{
pInvertNode->pNextNode = (InvertNode *) malloc (sizeof(InvertNode));
pInvertNode = pInvertNode->pNextNode;
strcpy(pInvertNode->sWords, uTemp);
pInvertNode->pNextNode = NULL;
pInvertNode->lDocNum = 0;
pInvertNode->pDocNode = (DocNode *) malloc (sizeof(DocNode));
pCurDoc = pInvertNode->pDocNode;
pCurDoc->pNext = NULL;
}
}
/*如果该字母已出现过*/
else
{
pInvertNode = &EIntIdx[uTemp[0] - LOWERA];
/*寻找该词*/
flag = 0;
if (strcmp(uTemp,pInvertNode->sWords) == 0)
{
flag = 1;
}
else
{
while (pInvertNode->pNextNode != NULL)
{
/*如果该词出现过*/
if(strcmp(uTemp,pInvertNode->sWords) == 0)
{
flag = 1;
break;
}
pInvertNode = pInvertNode->pNextNode;
}
if (strcmp(uTemp,pInvertNode->sWords) == 0)
{
flag = 1;
}
}
/*如果该词是未出现的词*/
if(flag == 0)
{
(EIntIdx[uTemp[0] - LOWERA].lWordNum)++;
pInvertNode->pNextNode = (InvertNode *)malloc(sizeof(InvertNode));
pInvertNode = pInvertNode->pNextNode;
/*这句可要可不要,因为只在单字节点中记录以该字为头的词数,不在词节点中记录个数*/
pInvertNode->lWordNum = 1;
pInvertNode->lDocNum = 0;
pInvertNode->pNextNode = NULL;
strcpy(pInvertNode->sWords, uTemp);
/*分配文档节点的内存*/
pInvertNode->pDocNode = (DocNode *) malloc(sizeof(DocNode));
pCurDoc = pInvertNode->pDocNode;
pCurDoc->iPos = NULL;
pCurDoc->pNext = NULL;
}
/*如果以前出现过,则找到该词所在的pInvertNode,然后新添加文档节点*/
else
{
pCurDoc = pInvertNode->pDocNode;
/*找到链表的尾端*/
while (pCurDoc->pNext != NULL)
{
pCurDoc = pCurDoc->pNext;
}
pCurDoc ->pNext = (DocNode *) malloc (sizeof(DocNode));
pCurDoc = pCurDoc->pNext;
pCurDoc->iPos = NULL;
pCurDoc->pNext = NULL;
}
}
/*填入文档信息*/
pInvertNode->lDocNum ++;
strcpy(pCurDoc->sDocID,(fNode + i)->sDocID);
pCurDoc->fWeight = pCurWordNode->fWeight;
pCurDoc->iFreq = pCurWordNode->iFreq;
pCurDoc->iPos = (int *) malloc(pCurDoc->iFreq * sizeof(int));
for( k = 0; k < pCurWordNode->iFreq; k++)
{
pCurDoc->iPos[k] = pCurWordNode->iPos[k];
}
strcpy(pCurDoc->sFileURL, (fNode + i)->sFileURL);
}
/*处理完一个节点,释放正向表词节点所占的内存*/
pCurWordNode = pCurWordNode->pnext;
}
}
/*if(IndexError)
free(IndexError);*/
return 0;
}
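/* Write the contents of the given singly linked list to the specified file;
 * if the file exceeds the specified size, create a new file.
 * The new file is named: original file name + number.
 * NOTE(review): no list-writing routine follows this comment directly; it
 * presumably describes a caller of the helpers below - confirm.
 */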
/*
 * filesize - return the size of an open stream in bytes.
 *
 * Seeks to the end of the stream to measure it and then restores the
 * position the stream had on entry.
 *
 * Returns the size in bytes, or -1 when stream is NULL, the current
 * position cannot be read, or either seek fails (for example on a
 * non-seekable stream).
 */
long filesize(FILE *stream)
{
    long curpos, length;

    if (stream == NULL)
    {
        return -1L;
    }
    curpos = ftell(stream);
    if (curpos < 0L)
    {
        return -1L;
    }
    if (fseek(stream, 0L, SEEK_END) != 0)
    {
        return -1L;
    }
    length = ftell(stream);
    /* The caller relies on the position being unchanged. */
    if (fseek(stream, curpos, SEEK_SET) != 0)
    {
        return -1L;
    }
    return length;
}
/*
 * get_file_seq_num - extract the sequence number from a file name.
 *
 * The sequence number is the run of decimal digits that ends immediately
 * before the last '.' in the name ("12.txt" -> 12, "idx7.txt" -> 7).
 * When the name has no '.', the digits at the end of the name are used.
 *
 * Returns the number, or 0 when the name is NULL, empty, or carries no
 * digits in that position.
 */
int get_file_seq_num(char * sFileName)
{
    const char *dot;
    size_t numend, numstart;

    if (sFileName == NULL)
    {
        return 0;
    }

    /* The number ends where the extension begins. */
    dot = strrchr(sFileName, '.');
    numend = (dot != NULL) ? (size_t) (dot - sFileName) : strlen(sFileName);

    /* Walk back over digits only, so a trailing '0' digit is not mistaken
     * for "no number" as a value-based scan would do. */
    numstart = numend;
    while ((numstart > 0) && (sFileName[numstart - 1] >= '0') && (sFileName[numstart - 1] <= '9'))
    {
        numstart--;
    }
    if (numstart == numend)
    {
        return 0;
    }

    /* atoi stops at the '.' (or the terminator) that follows the digits. */
    return atoi(sFileName + numstart);
}
/*
 * create_next_file_name - derive the name of the next index file.
 *
 * Reads the decimal sequence number that ends right before the last '.'
 * of sname_in, adds one, and writes "<number><extension>" into the buffer
 * *sname_out ("9.txt" -> "10.txt"). A name without digits in that
 * position counts as sequence 0. Any text preceding the digits is not
 * carried over to the result.
 *
 * Parameters:
 *   sname_in  - current file name; not modified.
 *   sname_out - address of a caller-provided buffer; it must be able to
 *               hold strlen(sname_in) + 2 bytes. It may alias sname_in.
 *
 * Returns 0 on success, or -1 when an argument is NULL, the number has
 * more than 9 digits, or memory is exhausted (the buffer is left
 * untouched in those cases).
 */
int create_next_file_name(char *sname_in, char ** sname_out)
{
    const char *suffix;
    char *tmpstr;
    size_t length, numend, numstart, k;
    long seq;

    if ((sname_in == NULL) || (sname_out == NULL) || (*sname_out == NULL))
    {
        return -1;
    }

    length = strlen(sname_in);

    /* The extension (including its dot) is copied through unchanged. */
    suffix = strrchr(sname_in, '.');
    if (suffix == NULL)
    {
        suffix = sname_in + length;
    }
    numend = (size_t) (suffix - sname_in);

    /* Walk back over the digits that precede the extension. */
    numstart = numend;
    while ((numstart > 0) && (sname_in[numstart - 1] >= '0') && (sname_in[numstart - 1] <= '9'))
    {
        numstart--;
    }

    /* Nine digits always fit in a long, even after adding one. */
    if (numend - numstart > 9)
    {
        return -1;
    }
    seq = 0L;
    for (k = numstart; k < numend; k++)
    {
        seq = seq * 10L + (long) (sname_in[k] - '0');
    }

    /* Build in scratch space so that *sname_out may alias sname_in.
     * The result is at most one character longer than the input. */
    tmpstr = (char *) malloc(length + 2);
    if (tmpstr == NULL)
    {
        return -1;
    }
    sprintf(tmpstr, "%ld%s", seq + 1L, suffix);
    strcpy(*sname_out, tmpstr);
    free(tmpstr);
    return 0;
}
/* Whitespace test matching what fscanf("%s") treats as a separator. */
static int InvtIsSpace(int ch)
{
    return (ch == ' ') || (ch == '\t') || (ch == '\n') ||
           (ch == '\r') || (ch == '\v') || (ch == '\f');
}

/*
 * GetWrtFlName - find out which index file should be written next.
 *
 * Index output rolls over to a new file once the current one grows past
 * a size limit. The name of the file written last (which may not have
 * reached that limit yet) is stored in "<sCurDir>\$curfile.txt"; this
 * function reads it back.
 *
 * Parameters:
 *   sCurDir - the index directory.
 *
 * Returns a heap buffer of MAXPATHLEN bytes that the caller must free.
 * It holds the first whitespace-delimited token of the state file, or
 * "1.txt" when the state file cannot be opened or the token does not
 * contain exactly one '.' after its first character. Returns NULL when
 * sCurDir is NULL, the resulting path would not fit in MAXPATHLEN bytes,
 * or memory is exhausted.
 */
char* GetWrtFlName(const char *sCurDir)
{
    static const char sStateFile[] = "\\$curfile.txt";
    char *sDir;
    char *sFileName;
    FILE *stream;
    size_t len, i;
    int ch, count;

    if (sCurDir == NULL)
    {
        return NULL;
    }
    /* sizeof counts the terminator, so this is the full path size. */
    if (strlen(sCurDir) + sizeof(sStateFile) > (size_t) MAXPATHLEN)
    {
        return NULL;
    }

    sFileName = (char *) malloc(sizeof(char) * MAXPATHLEN);
    sDir = (char *) malloc(sizeof(char) * MAXPATHLEN);
    if ((sFileName == NULL) || (sDir == NULL))
    {
        free(sFileName);
        free(sDir);
        return NULL;
    }

    strcpy(sDir, sCurDir);
    strcat(sDir, sStateFile);
    stream = fopen(sDir, "r");
    free(sDir);

    if (stream == NULL)
    {
        /* First run for this directory: numbering starts at 1.txt. */
        strcpy(sFileName, "1.txt");
        return sFileName;
    }

    /* Read one token like fscanf("%s"), but never past the buffer. */
    do
    {
        ch = fgetc(stream);
    } while (InvtIsSpace(ch));
    len = 0;
    while ((ch != EOF) && !InvtIsSpace(ch) && (len + 1 < (size_t) MAXPATHLEN))
    {
        sFileName[len++] = (char) ch;
        ch = fgetc(stream);
    }
    sFileName[len] = '\0';
    fclose(stream);

    /* A usable name has exactly one '.' and a non-empty part before it;
     * anything else (including an empty state file) resets to 1.txt. */
    count = 0;
    for (i = 1; i < len; i++)
    {
        if (sFileName[i] == '.')
        {
            count++;
        }
    }
    if (count != 1)
    {
        strcpy(sFileName, "1.txt");
    }
    return sFileName;
}
/*该函数的功能:建立倒排索引时,当索引文档大小大于一定值时,写入另一个文档
本函数主要是记录当前所写的文档的名字,
该名字保存在$curfile.txt文件中。
入口参数:sFileName为当前的文件名字如1.txt,2,txt...等
sCurDir为当前索引目录如e:\invt\半
返回值:成功返回0
*/
int WrtFlName(const char *sFileName, const char *sCurDir)
{
char *sDir;
FILE *stream;
sDir = (char *) malloc(sizeof(char) * MAXPATHLEN);
strcpy(sDir, sCurDir);
strcat(sDir, "\\");
strcat(sDir, "$curfile.txt");
if ((stream = fopen(sDir, "w+")) == NULL)
{
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -