⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 invert10_31.c

📁 中文信息处理
💻 C
📖 第 1 页 / 共 4 页
字号:
	FILE *pTxtFile;
	
	if( (pTxtFile  = fopen( sFileName, "wt" )) == NULL )
	{
		return 0;
	}
	fprintf(pTxtFile,"%s",StringBuffer);
	fclose(pTxtFile);
	return 0;
}



/*
 * Purpose: release the memory of a document list.
 *
 * Walks the list that starts at pDocNode and, for every node, frees the
 * position buffer (iPos) and then the node itself.  The head node passed
 * in is freed too, so the caller must not use the pointer afterwards.
 *
 * pDocNode: first node of the list to release.
 * Returns:  EPAR when pDocNode is NULL, 0 otherwise.
 */
int MyDocFree(DocNode *pDocNode)
{
	DocNode *pNextDoc;

	if (pDocNode == NULL) {
		return EPAR;
	}
	while (pDocNode != NULL)
	{
		/* Read the successor before the node is freed; reading it
		   afterwards would touch freed memory. */
		pNextDoc = pDocNode->pNext;
		free(pDocNode->iPos);
		free(pDocNode);
		pDocNode = pNextDoc;
	}
	return 0;
}
/*
 * Purpose: release the memory hanging off one inverted-table entry.
 *
 * Every node chained behind pInvtNode has its DocNode list released with
 * MyDocFree and is then freed.  The entry pInvtNode itself is NOT freed:
 * only its DocNode list is released and its pDocNode / pNextNode links
 * are cleared.
 *
 * Returns EPAR when pInvtNode is NULL, 0 otherwise.
 */
int MyInvtFree(InvertNode *pInvtNode)
{
	InvertNode *pHead = pInvtNode;
	InvertNode *pWalk, *pFollow;

	if (pHead == NULL)
	{
		return EPAR;
	}

	/* Chained nodes first, front to back: document list, then the node */
	for (pWalk = pHead->pNextNode; pWalk != NULL; pWalk = pFollow)
	{
		pFollow = pWalk->pNextNode;
		MyDocFree(pWalk->pDocNode);
		free(pWalk);
	}
	pHead->pNextNode = NULL;

	/* Finally the head entry's own document list; the entry stays */
	MyDocFree(pHead->pDocNode);
	pHead->pDocNode = NULL;
	return 0;
}


/*
 * Helper: concatenate filepath and sub into a newly allocated string.
 * The buffer is at least MAXPATH bytes; when the joined path would not
 * fit, it is sized to the needed length plus RELCSIZE spare bytes.
 * Returns NULL when the allocation fails; the caller frees the result.
 */
static char *InvtJoinPath(const char *filepath, const char *sub)
{
	size_t need = strlen(filepath) + strlen(sub) + 1;
	size_t cap = (size_t) MAXPATH;
	char *sJoined;

	if (need > cap)
	{
		cap = need + (size_t) RELCSIZE;
	}
	sJoined = (char *) malloc(cap * sizeof(char));
	if (sJoined == NULL)
	{
		return NULL;
	}
	strcpy(sJoined, filepath);
	strcat(sJoined, sub);
	return sJoined;
}

/*
 * Helper: write one index entry to disk.
 * Creates filepath and the second-level directory filepath+uTemp, asks
 * GetWrtFlName for the file to write in that directory, and hands the
 * entry to wrt2engfile (bEnglish != 0) or wrtlst (bEnglish == 0).
 * Returns 0 on success, EMALLOC or EFILEWRT on failure.
 */
static int InvtWriteSlot(InvertNode *pCurIvtNode, char *filepath,
	const unsigned char *uTemp, int bEnglish)
{
	char *sCurDir, *sFileName;
	int bFailed;

	mkdir(filepath);
	/* Compose the second-level directory name */
	sCurDir = InvtJoinPath(filepath, (const char *) uTemp);
	if (sCurDir == NULL)
	{
		printf("error is memory is not enough!\n");
		return EMALLOC;
	}
	mkdir(sCurDir);

	/* Name of the inverted-index file that should be written now.
	   The result is freed below, as the surrounding code has always
	   treated it as heap memory owned by the caller. */
	sFileName = GetWrtFlName(sCurDir);

	/* 10000 is passed through unchanged as the writers' last argument
	   (presumably the per-file line limit -- TODO confirm). */
	if (bEnglish)
	{
		bFailed = (wrt2engfile(pCurIvtNode, sFileName, sCurDir, 10000) != 0);
	}
	else
	{
		bFailed = (wrtlst(pCurIvtNode, sFileName, sCurDir, 10000) != 0);
	}

	free(sFileName);
	free(sCurDir);

	if (bFailed)
	{
		if (bEnglish)
		{
			printf("error is English invert Index write, the number is: %d\n", EFILEWRT);
		}
		else
		{
			printf("error is GB invert Index write, the number is: %d\n", EFILEWRT);
		}
		return EFILEWRT;
	}
	return 0;
}

/*
 * Helper: release both index tables.
 * Chains of used entries are released with MyInvtFree, then the rows and
 * the tables themselves are freed.  EIntIdx may be NULL.
 */
static void InvtReleaseTables(InvertNode **InvertIndex, InvertNode *EIntIdx)
{
	int i, j;

	for(i = 0; i < (GBHTTOPNUM - GBHTBTMNUM + 1) ; i++)
	{
		for(j = 0; j < (GBLWTOPNUM - GBLWBTMNUM + 1) ; j++)
		{
			if (InvertIndex[i][j].lWordNum != 0)
			{
				MyInvtFree(&InvertIndex[i][j]);
			}
		}
		free(InvertIndex[i]);
	}
	free(InvertIndex);

	if (EIntIdx != NULL)
	{
		for(i = 0; i < 26; i++)
		{
			if (EIntIdx[i].lWordNum != 0)
			{
				MyInvtFree(&EIntIdx[i]);
			}
		}
		free(EIntIdx);
	}
}

/*
 * Purpose: build the inverted index tables and write them to disk
 * (the table index is mainly the GB code bytes minus 176 and 161).
 *
 * fNode:    forward table, read in as a linked list.
 * DocCount: number of documents.
 * filepath: base output directory; one sub-directory per index entry is
 *           created by appending the entry's characters to it.
 *
 * Returns 0 on success, otherwise EFNODE, EDOCNUM, EINDEXBUILD, EMALLOC
 * or EFILEWRT.
 *
 * NOTE(review): when IndexBuild or InvertIdxBld reports a failure the
 * tables are left allocated, because their state after a failed build is
 * not visible from here; on a write failure they are released.
 */
int InvertBuild(const ForwardNode *fNode, const int DocCount, char *filepath)
{
	InvertNode **InvertIndex, *EIntIdx;
	unsigned char uTemp[3];
	int IndexError = 0;
	int i,j;
	int count = 0;
	int iRet = 0;

	if(fNode == NULL)
	{
		printf("error is: %d\n", EFNODE);
		printf("正向表不存在!\n");
		return EFNODE;
	}
	if(DocCount < 0)
	{
		printf("error is: %d\n", EDOCNUM);
		return EDOCNUM;
	}

	/* First build the table for Chinese characters */
	InvertIndex = NULL;
	InvertIndex = IndexBuild(InvertIndex, &IndexError);
	if (IndexError != 0 || InvertIndex == NULL)
	{
		return EINDEXBUILD;
	}

	for(i = 0; i < (GBHTTOPNUM - GBHTBTMNUM + 1) ; i++)
		for(j = 0; j < (GBLWTOPNUM - GBLWBTMNUM + 1) ; j++)
		{
			InvertIndex[i][j].lWordNum = 0;
			InvertIndex[i][j].pDocNode = NULL;
			InvertIndex[i][j].pNextNode = NULL;
			InvertIndex[i][j].lDocNum = 0;
			strcpy(InvertIndex[i][j].sWords,"\0");
		}

	/* Then the table for English words, one entry per letter (26) */
	EIntIdx = NULL;
	EIntIdx = EIdxBuild(EIntIdx, &IndexError);
	if (IndexError != 0 || EIntIdx == NULL)
	{
		/* The Chinese table was just initialised above, so it is in a
		   known state and can be released; EIntIdx is not touched. */
		InvtReleaseTables(InvertIndex, NULL);
		return EINDEXBUILD;
	}
	for(i = 0; i < 26 ; i++)
	{
		EIntIdx[i].lWordNum = 0;
		EIntIdx[i].pDocNode = NULL;
		EIntIdx[i].pNextNode = NULL;
		EIntIdx[i].lDocNum = 0;
		strcpy(EIntIdx[i].sWords, "\0");
	}

	/* Copy the forward table content into the inverted index */
	if(InvertIdxBld(InvertIndex, EIntIdx, fNode, DocCount) != 0)
	{
		printf("error is InvertIdxBld, \n the Number is: %d\n", EINDEXBUILD);
		return EINDEXBUILD;
	}

	/* Write the GB (Chinese) inverted index to files */
	for(i = 0; i < (GBHTTOPNUM - GBHTBTMNUM + 1) ; i++)
		for(j = 0; j < (GBLWTOPNUM - GBLWBTMNUM + 1) ; j++)
		{
			if (InvertIndex[i][j].lWordNum == 0)
			{
				continue;
			}
			/* Directory name is the two-byte GB character itself */
			uTemp[0] = (unsigned char) (i + GBHTBTMNUM );
			uTemp[1] = (unsigned char) (j + GBLWBTMNUM );
			uTemp[2] = '\0';
			iRet = InvtWriteSlot(&InvertIndex[i][j], filepath, uTemp, 0);
			if (iRet != 0)
			{
				goto release;
			}
			count++;
		}

	/* Write the English words to the inverted files */
	for(i = 0; i < 26 ; i++)
	{
		if (EIntIdx[i].lWordNum == 0)
		{
			continue;
		}
		/* Directory name is the single letter of this entry */
		uTemp[0] = (unsigned char) (i + LOWERA );
		uTemp[1] = '\0';
		iRet = InvtWriteSlot(&EIntIdx[i], filepath, uTemp, 1);
		if (iRet != 0)
		{
			goto release;
		}
		count++;
	}
	printf("The number of word in file is: %d\n", count);

release:
	/* Release the memory held by the inverted tables */
	InvtReleaseTables(InvertIndex, EIntIdx);
	if (iRet == 0)
	{
		printf("free is over!\n");
		printf("InvertBuild is over!\n");
	}
	return iRet;
}



/*
 * Purpose: release the memory of the forward table.
 *
 * For each of the DocCt entries of fNode, every word node chained behind
 * the first one is freed together with its iPos and sWords buffers.  For
 * the first node only the iPos and sWords buffers are freed; afterwards
 * the entry's wFWordNode is set to NULL.
 *
 * NOTE(review): the first word node itself is not freed here, as before;
 * whether it is separately heap-allocated cannot be told from this
 * function -- confirm against the code that builds the forward table.
 *
 * fNode: array of forward-table entries.
 * DocCt: number of entries in fNode.
 * Returns -1 when fNode is NULL or its first entry has no word list,
 * 0 otherwise.
 */
int Myfree(ForwardNode *fNode, int DocCt)
{
	WordNode *pHead, *pWordNode, *pNextWord;
	int i;

	if(fNode == NULL || !fNode->wFWordNode) return -1;
	for(i = 0; i < DocCt; i++)
	{
		pHead = (fNode + i)->wFWordNode;
		if (pHead == NULL)
		{
			/* Entry without a word list: nothing to release */
			continue;
		}

		/* Free every node behind the head, reading the successor
		   before the node goes away */
		pWordNode = pHead->pnext;
		while (pWordNode != NULL)
		{
			pNextWord = pWordNode->pnext;
			free(pWordNode->iPos);
			free(pWordNode->sWords);
			free(pWordNode);
			pWordNode = pNextWord;
		}

		/* The head keeps existing, so clear what was just freed */
		free(pHead->iPos);
		free(pHead->sWords);
		pHead->iPos = NULL;
		pHead->sWords = NULL;
		pHead->pnext = NULL;
		(fNode + i)->wFWordNode = NULL;
	}

	return 0;
}


/*
 * Purpose: build the inverted table.
 *
 * Builds the forward table with ForwardBld, then, if that succeeded,
 * builds and writes the inverted index with InvertBuild, and finally
 * releases the forward table.
 *
 * strBuf:  buffer holding the text after word segmentation.
 * DocCt:   number of documents handled in this run.
 * sSource: path and name of the source document.
 * lPos:    not used by this function.
 *
 * Returns 0 once the run has finished (also when ForwardBld or
 * InvertBuild reported an error), -1 when the path buffers could not be
 * allocated.
 */
int invt_main(char *strBuf, int DocCt, char *sSource, long lPos)
{
	ForwardNode *pForward = NULL;
	char *sFilePath, *sRcdFile;
	int m;

	(void) lPos;

	sFilePath = (char *) malloc(MAXPATH * sizeof(char));
	sRcdFile = (char *) malloc(MAXPATH * sizeof(char));
	if (sFilePath == NULL || sRcdFile == NULL)
	{
		printf("error is memory is not enough!\n");
		free(sFilePath);
		free(sRcdFile);
		return -1;
	}

	/* Build the forward table */
	/* Output directory (hard-coded) */
	strcpy(sFilePath,"d:\\invt0_4");
	/* Name of the file that stores the document count */
	strcpy(sRcdFile, "Record.txt");
	m = ForwardBld(sFilePath, sRcdFile, sSource, &pForward, DocCt, strBuf);
	strcat(sFilePath,"\\");

	/* Build the inverted index (indexed by the GB code bytes minus 176
	   and 161) and write the inverted tables; when a table exceeds the
	   maximum number of lines another file is started.  Only the
	   address of the forward table is needed here, not a double
	   pointer. */
	if(m == 0)
		InvertBuild(pForward, DocCt, sFilePath);
	else
		printf("建立正向表错误!\n");

	/* Release the forward table; ForwardBld may have left it NULL */
	if (pForward != NULL)
	{
		Myfree(pForward, DocCt);
		free(pForward);
	}
	free(sFilePath);
	free(sRcdFile);

    printf("invt_main is over!\n\n\n");
	return 0;
}



⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -