⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 dealtxtfile.h

📁 基于最短路径的中文分词
💻 H
字号:
//文本处理

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

//Shared state of the segmenter. Everything below is file-scope and is
//read/written by several functions in this header.

//Source text stream: opened in dealtxtfile(), consumed by getsentence().
FILE *fsourcetxt;
//sentenlen    - byte length of the sentence last returned by getsentence()
//filepos      - running position estimate maintained by dealtxtfile()
//dictlen      - number of entries loaded into dictbuf
//punctulen    - number of bytes of punctu.txt held in punctubuf
//nontxtbuflen - bytes written to the non-text buffer by getsentence()
//filelen      - size of the source file as reported by ftell()
int sentenlen,filepos,dictlen,punctulen,nontxtbuflen,filelen;
//Byte counts of bjpunctu.txt / bjmark.txt held in bjpunctubuf / bjmarkbuf.
int bjpunctulen,bjmarklen;
//Result flags set by getsentence(): text was seen / a single-byte character
//was seen / a two-byte punctuation mark ended the sentence.
int txtflag,nontxtflag,punctuflag;
//dictbuf   - dictionary, one NUL-terminated word (max 16 bytes) per row
//punctubuf - two-byte punctuation marks, stored back to back
//nontxtbuf - NOTE(review): never assigned in this header; getsentence() and
//            dealtxtfile() use a parameter/local of the same name instead
//punctutmp - the two-byte punctuation mark most recently compared/matched
char (*dictbuf)[17],(*punctubuf),(*nontxtbuf),punctutmp[3];
//Single-byte punctuation characters and single-byte letters/digits.
char (*bjpunctubuf),(*bjmarkbuf);

void loaddict()
{
	FILE *fdict,*fpunctu,*fbjpunctu,*fbjmark;    

	dictbuf=new char[200000][17];
	punctubuf=new char[100];
	bjpunctubuf=new char[100];
	bjmarkbuf=new char[100];

	//把字典读入dictbuf
	if((fdict=fopen("dict.txt","r"))==NULL){
		printf("cannot open dict!%s\n");
		exit(0);
	}
	dictlen=0;
	while(!feof(fdict)){
		fscanf(fdict,"%s",dictbuf[dictlen]);
		dictlen++;
	}
	fclose(fdict);

	//读入标点库
	if((fpunctu=fopen("punctu.txt","r"))==NULL){
		printf("cannot open punctu.txt file!%s\n");
		exit(0);
	}
	fseek(fpunctu,0L,SEEK_END);
	punctulen=ftell(fpunctu);
	fseek(fpunctu,0L,SEEK_SET);
	fread(punctubuf,1,punctulen,fpunctu);
	fclose(fpunctu);//需要处理的地方

	//读入半角标点字符
	if((fbjpunctu=fopen("bjpunctu.txt","r"))==NULL){
		printf("cannot open bjpunctu.txt file!%s\n");
		exit(0);
	}
	fseek(fbjpunctu,0L,SEEK_END);
	bjpunctulen=ftell(fbjpunctu);
	fseek(fbjpunctu,0L,SEEK_SET);
	fread(bjpunctubuf,1,bjpunctulen,fbjpunctu);
	fclose(fbjpunctu);

	//读入半角字符
	if((fbjmark=fopen("bjmark.txt","r"))==NULL){
		printf("cannot open bjmark.txt file!%s\n");
		exit(0);
	}
	fseek(fbjmark,0L,SEEK_END);
	bjmarklen=ftell(fbjmark);
	fseek(fbjmark,0L,SEEK_SET);
	fread(bjmarkbuf,1,bjmarklen,fbjmark);
	fclose(fbjmark);

	//读入全角标点字符???未做还是放在标点库里面???
}

//Returns the larger of the two integers (num1 when they are equal).
int getmaxnum(int num1,int num2)
{
	return (num2>num1)?num2:num1;
}

//Binary search for src in the dictionary (dictbuf[0..dictlen-1], compared
//with strcmp(), so the table must be sorted in strcmp() order).
//
//Returns 1 when src is found. Otherwise returns -(p+1), where p is the
//index at which src would have to be inserted to keep the table sorted;
//the result is therefore always <= -1 for a miss, including for an empty
//dictionary (which previously read uninitialised variables).
int halfsearch(char* src)
{
	int poslow,posmid,poshigh,temp;

	poslow=0;
	poshigh=dictlen-1;
	while(poslow<=poshigh){
		posmid=poslow+(poshigh-poslow)/2;
		temp=strcmp(src,dictbuf[posmid]);
		if(temp==0){
			return 1;
		}
		if(temp>0){
			poslow=posmid+1;
		}
		else{
			poshigh=posmid-1;
		}
	}
	//When the loop ends poslow is the insertion index: posmid+1 if the last
	//comparison was "greater", posmid if it was "less".
	return (-1)*(poslow+1);
}

//取句子
void getsentence(char *sentence,char *nontxtbuf)
{
	int sentenceflag,i;
	int bjpunctuflag,bjmarkflag,bjnumflag;
	char wordtemp[3],bjpunctutemp[2],bjmarktemp[2],wordtemp1[3];

	txtflag=0;
	nontxtflag=0;
	punctuflag=0;
	bjpunctuflag=0;
	bjmarkflag=0;
	bjnumflag=0;
	nontxtbuflen=0;
	sentenceflag=(int)sentence;//计算句子长度sentencelen
	for(i=0;i<3;i++){
		wordtemp[i]='\0';
	}
	for(i=0;i<3;i++){
		wordtemp1[i]='\0';
	}
	do{//开始取句子
		wordtemp[0]=fgetc(fsourcetxt);
		if((int)wordtemp[0]>0){  //可能是半角字符
			if((int)wordtemp[0]==10){//是换行回车
				wordtemp[1]=fgetc(fsourcetxt);
				if((int)wordtemp[1]==13){
					wordtemp1[0]=fgetc(fsourcetxt);
					if((int)wordtemp1[0]==32){
						fseek(fsourcetxt,(long)(-1),SEEK_CUR);

					}
				}
				else{
					//sth wrong!???
				}
			}
			else{   //不是换行回车
			nontxtflag=1;
			wordtemp[1]='\0';
			for(i=0;i<bjpunctulen;i++){  //看是否是标点
				bjpunctutemp[0]=bjpunctubuf[i];
				bjpunctutemp[1]='\0';
				if(strcmp(wordtemp,bjpunctutemp)==0){
					*nontxtbuf=wordtemp[0];
					nontxtbuf++;
					nontxtbuflen++;
					*nontxtbuf='|';
					nontxtbuf++;
					nontxtbuflen++;
					*nontxtbuf='\0';
					bjpunctuflag=1;
					break;
				}
			}
			if(bjpunctuflag==1){  //半角标点出现,句子取完
				if(txtflag==1){
					*sentence='\0';
				}
				nontxtflag=1;
				break;
			}
			else{   //若不是标点,看是否是字母或数字。
				for(i=0;i<bjmarklen;i++){
					bjmarktemp[0]=bjmarkbuf[i];
					bjmarktemp[1]='\0';
					if(strcmp(wordtemp,bjmarktemp)==0){
						*nontxtbuf=wordtemp[0];
						nontxtbuf++;
						nontxtbuflen++;
						*nontxtbuf='|';
						nontxtbuf++;
						nontxtbuflen++;
						*nontxtbuf='\0';
						bjmarkflag=1;
						break;
					}
				}
			}
			if(bjmarkflag==1){
				nontxtflag=1;
				if(txtflag==1){
					*sentence='\0';
				}
				break;
			}
			else{   //半角标点和半角字符里都没有该字符
			//	MessageBox("Some DBC case not identified!","Error",MB_ICONERROR);
			}
			}
		}
		else{   //可能是汉字的部分
			txtflag=1;
			wordtemp[1]=fgetc(fsourcetxt);
			if((int)wordtemp[1]>=0){
			//	MessageBox("Error in the file! half word");
			}
			else{
				wordtemp[2]='\0';
				//看是不是标点
				for(i=0;i<punctulen/2;i++){
					punctutmp[0]=punctubuf[i*2];
					punctutmp[1]=punctubuf[i*2+1];
					punctutmp[2]='\0';
					if (strcmp(punctutmp,wordtemp)==0){
						punctuflag=1;
						break;
					}//取到的标点在punctutmp中,以备读入segmenttemp
				}
				//若不是标点,要看是否是全角数字,字符
			}
			if (punctuflag==1){
				*sentence='\0';
				break;
			}
			else{
				*sentence=wordtemp[0];
				sentence++;
				*sentence=wordtemp[1];
				sentence++;
			}
		}
	}while(filepos<filelen);
	//找到标点作为处理标志,句子取完
	sentenlen=(int)sentence-sentenceflag;
}

//Segments `sentence` field by field using a shortest-path search and writes
//the resulting words to `segment`, each followed by '|'.
//
//The input is read as two-byte characters; the global sentenlen gives its
//length in bytes. For every character position the function looks up all
//words of 1..7 characters starting there (halfsearch), records matches in
//ptpdistance, and maintains in fieldshortcut the smallest number of words
//needed to reach each position. A field ends when no matched word reaches
//past the current character; the field is then cut along the recorded route.
//
//Returns the number of bytes written to `segment` (the terminating NUL is
//written but not counted). MAXSENTENCELEN and MAXNUM are defined outside
//this header.
int field(char *sentence,char *segment)
{
	int (*fieldshortcut),(*ptpdistance)[MAXSENTENCELEN],(*fieldroute)[10];
	char word[17];
	int maxlen,routenum,matchflag,route,sentencepos,wordlen;
	int i,j,k,l,fieldcur,getnum,segmentlen;
	int fieldoffset,fieldleft,fieldlen,fieldside;
	
	fieldshortcut=new int[MAXSENTENCELEN];              //shortest path length up to each character
	ptpdistance=new int[MAXSENTENCELEN][MAXSENTENCELEN];//match matrix: [end character][start character]
	fieldroute=new int[MAXSENTENCELEN][10];             //predecessors per character; [][0] is the number of shortest routes

	sentencepos=0;
	segmentlen=0;
	//Keep splitting off fields until the sentence is consumed.
	while(sentencepos<sentenlen){
		fieldoffset=0;   //character offset from the start of the field
		fieldleft=0;     //characters still covered by an already matched word
		fieldlen=0;      //number of characters in the field
		fieldside=0;     //number of words the field is cut into
		matchflag=0;

		//Reset all tables to MAXNUM.
		for(i=0;i<MAXSENTENCELEN;i++){
			for(j=0;j<10;j++){
				fieldroute[i][j]=MAXNUM;
			}
		}
		fieldshortcut[0]=0;   //entry 0 is the zero base, so this array is shifted by one relative to the others
		for(i=1;i<MAXSENTENCELEN;i++){
			fieldshortcut[i]=MAXNUM;
		}
		for(i=0;i<MAXSENTENCELEN;i++){
			for(j=0;j<MAXSENTENCELEN;j++){
				ptpdistance[i][j]=MAXNUM;
			}
		}
		//Process one field of the sentence.
		do{
			//Each iteration advances by exactly one character.
			wordlen=0;
			maxlen=0;   //longest word matched from the current character
			routenum=0;
			if((sentenlen-sentencepos)/2>=7){
				getnum=7;
			}   //look ahead at most 7 characters, fewer near the end
			else{
				getnum=(sentenlen-sentencepos)/2;
			}
			//Try words of 1..getnum characters starting at the current character.
			for(i=0;i<getnum;i++){
				word[wordlen*2+0]=*sentence;
				sentence++;
				word[wordlen*2+1]=*sentence;
				sentence++;
				word[wordlen*2+2]='\0';
				wordlen++;
				matchflag=halfsearch(word);//dictionary lookup
				if(matchflag==1){
					ptpdistance[fieldoffset+wordlen-1][fieldoffset]=1;   //word from fieldoffset to fieldoffset+wordlen-1 exists
					maxlen=wordlen;   //longest match so far from this character
				}
				matchflag=0;
			}//all candidate words for this character tried
			if(maxlen==0){
				//No word starts here: the field ends before this character.
				//NOTE(review): when this happens at fieldoffset 0 the value
				//becomes -1, the ==0 test below does not catch it, and
				//fieldshortcut[fieldlen] / fieldroute[fieldlen] are then
				//indexed with -1 - confirm whether that path is reachable.
				//The pointer rewind further down is also skipped on this path.
				fieldoffset--;
				break;
			}
			//Shortest path to the character at fieldoffset (stored at index fieldoffset+1).
			for(i=0;i<=fieldoffset;i++){
				//A strictly shorter path replaces what was recorded so far.
				if((ptpdistance[fieldoffset][i]+fieldshortcut[i])<fieldshortcut[fieldoffset+1]){
					fieldshortcut[fieldoffset+1]=ptpdistance[fieldoffset][i]+fieldshortcut[i];
					routenum=1;                             //a new minimum means exactly one route
					fieldroute[fieldoffset][routenum]=i;	//start position of the last word on that route
				}
				else if((ptpdistance[fieldoffset][i]+fieldshortcut[i])==fieldshortcut[fieldoffset+1]){
					//NOTE(review): routenum is not bounded by the row size of 10,
					//and MAXNUM+x is computed for unmatched pairs - confirm MAXNUM
					//leaves headroom.
					routenum++;
					fieldroute[fieldoffset][routenum]=i;    //positions count from 0
				}   //equally short path: remember this start position too
			}
			fieldroute[fieldoffset][0]=routenum;            //slot 0 holds the number of routes
			fieldoffset++;                                  //offsets count from 0
			sentencepos+=2;
			for(i=0;i<(getnum-1)*2;i++){
				sentence--;
			}   //undo the look-ahead so the pointer is on the next character
			fieldleft=getmaxnum(maxlen,fieldleft);          //characters still covered by the longest pending word
			fieldleft--;
		}while(fieldleft>0);   //field complete

		for(i=0;i<fieldoffset*2;i++){
			sentence--;
		}   //rewind to the start of the field before cutting
		if(fieldoffset==0){
			break;
		}
		else{//cut the field into words
			fieldlen=fieldoffset;
			fieldside=fieldshortcut[fieldlen];
			routenum=1;
			fieldcur=0;
			k=0;
			route=fieldlen;
			fieldroute[fieldlen][k]=route;
			k++;
			//The cut positions are collected in fieldroute[fieldlen] in reverse
			//order. Only the first recorded predecessor is followed; characters
			//with several shortest routes are merely counted in routenum.
			for(j=1;j<fieldside;j++){
				fieldroute[fieldlen][k]=fieldroute[route-1][1];
				routenum=routenum*fieldroute[route-1][0];
				route=fieldroute[fieldlen][k];
				k++; 
			}   //walk back one word at a time, fieldside-1 steps
			for(i=(k-1);i>=0;i--){
				for(l=fieldcur;l<(fieldroute[fieldlen][i]*2);l++){
					*segment=*sentence;
					segment++;
					sentence++;
					segmentlen++;
				}
				*segment='|';
				segment++;
				segmentlen++;
				fieldcur=l;
			}   //copy the words out along the recorded route
			if(routenum>1){    //report positions with more than one shortest route
				printf("more route %d\n",sentencepos);
			}
		}   //field cut
		if(maxlen==0){
			break;
		}
	}
	*segment='\0';   //terminate the output string

	delete []ptpdistance;
	delete []fieldroute;
	delete []fieldshortcut;
	return(segmentlen);
}


//Forward maximum matching.
//
//Walks `sentencetemp` (sentenlen bytes, two bytes per character) from the
//left. At each position the longest dictionary word of at most
//(WORDLEN-2)/2 characters is taken; if no candidate is in the dictionary
//the single character is emitted as a word of its own, which guarantees
//progress (the previous code looped forever in that case). Every word is
//written to `segmentbuf` followed by '|', and the output is always
//NUL-terminated.
//
//Returns the number of bytes written, not counting the terminator.
//A dangling odd byte at the end of the sentence is dropped.
int forward(char *sentencetemp,char *segmentbuf)
{
	int sentencepos,segmentpos,templen,wordlen;
	int i,j;
	char sevenword[17];

	sentencepos=0;
	segmentpos=0;
	while(sentenlen-sentencepos>=2){
		//Size of the look-ahead window in bytes.
		templen=sentenlen-sentencepos;
		if(templen>WORDLEN-2){
			templen=WORDLEN-2;
		}
		if(templen>(int)sizeof(sevenword)-1){
			templen=(int)sizeof(sevenword)-1;   //never outgrow the local buffer
		}

		//Longest candidate first; stop at the first dictionary hit.
		for(i=templen/2;i>1;i--){
			for(j=0;j<i*2;j++){
				sevenword[j]=sentencetemp[j];
			}
			sevenword[i*2]='\0';
			if(halfsearch(sevenword)==1){
				break;
			}
		}
		if(i<1){
			i=1;
		}
		//i==1 here means nothing longer matched: take the single character,
		//whether or not it is in the dictionary.
		wordlen=i*2;

		for(j=0;j<wordlen;j++){
			*segmentbuf=*sentencetemp;
			segmentbuf++;
			sentencetemp++;
			segmentpos++;
		}
		sentencepos+=wordlen;
		*segmentbuf='|';
		segmentbuf++;
		segmentpos++;
	}
	*segmentbuf='\0';
	return(segmentpos);
}

//Backward maximum matching.
//
//Walks `sentence` (sentenlen bytes, two bytes per character) from the
//right. At each step the longest dictionary word of at most (WORDLEN-2)/2
//characters ending at the current position is taken; if no candidate is in
//the dictionary the single character is taken as a word of its own, which
//guarantees progress (the previous code looped forever in that case).
//The words are then written to `backwardbuf` in reading order, each
//followed by '|', and the output is NUL-terminated.
//
//Returns the number of bytes written, not counting the terminator. An
//empty sentence yields an empty string and 0 (the previous code read
//segmentbuf[-1]). A dangling odd byte at the start of the sentence is
//dropped.
int backward(char *sentence,char *backwardbuf)
{
	char sevenword[17];
	int (*wordbytes);
	int wordcount,remain,templen,segmentlen;
	int i,j,k,pos;

	//One slot per possible word: a word is at least two bytes long.
	wordbytes=new int[(sentenlen>0?sentenlen:0)/2+1];

	wordcount=0;
	remain=sentenlen;   //bytes in front of the part already segmented
	while(remain>=2){
		//Size of the look-back window in bytes.
		templen=remain;
		if(templen>WORDLEN-2){
			templen=WORDLEN-2;
		}
		if(templen>(int)sizeof(sevenword)-1){
			templen=(int)sizeof(sevenword)-1;   //never outgrow the local buffer
		}

		//Longest suffix first; stop at the first dictionary hit.
		for(i=templen/2;i>1;i--){
			for(j=0;j<i*2;j++){
				sevenword[j]=sentence[remain-i*2+j];
			}
			sevenword[i*2]='\0';
			if(halfsearch(sevenword)==1){
				break;
			}
		}
		if(i<1){
			i=1;
		}
		//i==1 here means nothing longer matched: take the single character.
		wordbytes[wordcount]=i*2;
		wordcount++;
		remain-=i*2;
	}

	//Words were found right to left; emit them left to right.
	pos=remain;
	segmentlen=0;
	for(k=wordcount-1;k>=0;k--){
		for(j=0;j<wordbytes[k];j++){
			backwardbuf[segmentlen]=sentence[pos];
			segmentlen++;
			pos++;
		}
		backwardbuf[segmentlen]='|';
		segmentlen++;
	}
	backwardbuf[segmentlen]='\0';

	delete []wordbytes;
	return(segmentlen);
}

//Segments the text file `dealfilename` and writes the result to
//`segfilename`.
//
//The dictionary and character tables are loaded first (loaddict). The
//source is then read sentence by sentence (getsentence); every sentence
//that contains text is segmented with the shortest-path method (field),
//with forward and with backward maximum matching, and the shortest-path
//result is written out. The terminator that ended the sentence follows:
//the single-byte character plus '|' from the non-text buffer, or the
//two-byte punctuation mark left in punctutmp.
//
//The program exits with a failure status if either file cannot be opened.
void dealtxtfile(char *dealfilename,char* segfilename)
{
	FILE *fsegment;
	char (*shortestbuf),(*forwardbuf),(*backwardbuf);
	char (*sentencetemp),(*nontxtbuf);
	int shortestbuflen,forwardbuflen,backwardbuflen;

	shortestbuf=new char[MAXSENTENCELEN];
	forwardbuf=new char[MAXSENTENCELEN];
	backwardbuf=new char[MAXSENTENCELEN];
	sentencetemp=new char[MAXSENTENCELEN];
	nontxtbuf=new char[MAXBUFLEN];

	loaddict();

	if ((fsourcetxt=fopen(dealfilename,"r"))==NULL){
		printf("can't open source text file!%s\n",dealfilename);
		exit(EXIT_FAILURE);
	}
	if ((fsegment=fopen(segfilename,"w"))==NULL){
		//Without an output stream every fwrite() below would be handed a
		//null pointer, so stop here instead of carrying on.
		printf("Fail to creat the segment.txt file!%s\n",segfilename);
		fclose(fsourcetxt);
		exit(EXIT_FAILURE);
	}
	fseek(fsourcetxt,0L,SEEK_END);
	filelen=ftell(fsourcetxt);
	fseek(fsourcetxt,0L,SEEK_SET);
	filepos=0;

	//Process the text.
	while(!feof(fsourcetxt))
	{
		//Read one sentence.
		getsentence(sentencetemp,nontxtbuf);
		filepos=filepos+sentenlen+2;
		if(txtflag==1){
			//Shortest-path segmentation.
			shortestbuflen=field(sentencetemp,shortestbuf);
			//Forward maximum matching.
			forwardbuflen=forward(sentencetemp,forwardbuf);
			//Backward maximum matching.
			backwardbuflen=backward(sentencetemp,backwardbuf);
			//Write the segmented sentence.
			if((strcmp(shortestbuf,forwardbuf)==0)&&(strcmp(forwardbuf,backwardbuf)==0)){
				fwrite(shortestbuf,1,shortestbuflen,fsegment);
			}
			else{
				//The three methods disagree. How to arbitrate is still open
				//(flagged by the original author); for now the shortest-path
				//result is used here as well.
				fwrite(shortestbuf,1,shortestbuflen,fsegment);
			}
		}
		if(nontxtflag==1){
			fwrite(nontxtbuf,1,nontxtbuflen,fsegment);
		}
		else if(punctuflag==1){
			fwrite(punctutmp,1,2,fsegment);
		}
		//Neither flag set: nothing ended the sentence; not reported yet.
		if(filepos>=filelen){
			break;
		}
	}
	printf("succeed! c u! \n");
	delete []shortestbuf;
	delete []forwardbuf;
	delete []backwardbuf;
	delete []sentencetemp;
	delete []nontxtbuf;
	fclose(fsourcetxt);
	fclose(fsegment);
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -