📄 mysegprogramm.cpp

📁 一个文本分割程序
💻 CPP
📖 第 1 页 / 共 2 页
字号:
上一页 12
			w=iter1->first;
			fre_w=iter1->second;
			if(BeContained(w1,w))
			{
				float con=conf(fre_w1,fre_w);
				if(con<0.2)
				{
					allsegdic.erase(allsegdic.find(w1));
					iter=allsegdic.begin();
				}
				else if(con>0.9)
				{
					allsegdic.erase(allsegdic.find(w));
					iter=allsegdic.begin();
				}
				else
				{
					allsegdic.erase(allsegdic.find(w1));
					allsegdic.erase(allsegdic.find(w));
					iter=allsegdic.begin();
				}
			}
		}
 	}
	
	for(iter=allsegdic.begin();iter!=allsegdic.end();iter++)
	{
		w1=iter->first;
		int len=w1.length();
		for(int k=0;k<len;k+=2)
			if((w1.substr(k,2).compare("的")==0)||(w1.substr(k,2).compare("地")==0)||(w1.substr(k,2).compare("以")==0))
			{
				allsegdic.erase(iter);
				iter=allsegdic.begin();
				break;
			}	
	}
	for(iter=tempdic.begin();iter!=tempdic.end();iter++)
	{
		cout<<"temp:"<<iter->first<<"/"<<iter->second<<endl;
	}
//	exit(0);
}

void SegProgramm::filter()//过滤器，目前只能过滤频率低的，需要继续完善，删除一些奇怪的词语。
{
	map<string,int> tempDic;
	tempDic.clear();
	if(allsegdic.begin()==allsegdic.end())
	{
		cout<<"the dictionary is empty!(all segmentation dictionary)"<<endl;
	}
	else
	{
		map<string,int>::iterator iter;
		for(iter=allsegdic.begin();iter!=allsegdic.end();iter++)
		{
			int tmp=iter->second;
			if(tmp>2)
			{
				tempDic.insert(pair<string,int>(iter->first,iter->second));
			}
		}

		allsegdic.clear();
		allsegdic=tempDic;
	}
	
	
}

// Classifies an item by its first character, following the GB2312 byte layout
// this file works with. Only the first one or two bytes of str are inspected.
//
// Returns:
//   -1  str is empty
//   21  one-byte digit (0x30-0x39)
//   22  one-byte character in 0x21-0x2f or 0x3a-0x7e
//   23  any other one-byte character (0x01-0x20, 0x7f)
//   11  two-byte code strictly between 0xa3af and 0xa3ba
//   12  two-byte code strictly between 0xa1a0 and 0xa3b0, or strictly between
//       0xa3b9 and 0xa3fe
//   13  everything else (treated as Chinese text)
//
// NOTE(review): the one-byte range 0x3a-0x7e also contains the ASCII letters,
// so English letters are reported as 22 rather than 23 - confirm this is the
// intended classification.
//
// Bytes are handled as unsigned values, so the result does not depend on
// whether plain char is signed on the target platform, and no negative value
// is ever left-shifted.
int SegProgramm::judgeStringisWord(string str)
{
	if(str.empty())
		return -1;

	const char *bytes=str.c_str();
	const unsigned int lead=static_cast<unsigned char>(bytes[0]);

	// One-byte character: lead byte 0x01-0x7f.
	if(lead>0&&lead<0x80)
	{
		if(0x30<=lead&&lead<=0x39)
			return 21;
		if((0x21<=lead&&lead<=0x2f)||(0x3a<=lead&&lead<=0x7e))
			return 22;
		return 23;
	}

	// Two-byte character. bytes[1] is always readable: for a one-byte string
	// it is the terminating NUL supplied by c_str().
	const unsigned int code=(lead<<8)|static_cast<unsigned char>(bytes[1]);
	if(0xa3af<code&&code<0xa3ba)
		return 11;	// digit
	if((0xa1a0<code&&code<0xa3b0)||(0xa3b9<code&&code<0xa3fe))
		return 12;	// punctuation
	return 13;		// Chinese text
}


// Splits one line of GB2312 text into consecutive runs of the same kind
// (digits, punctuation, Chinese text, other one-byte characters) and stores
// the runs, in order, in mylist. Word segmentation proper is later applied to
// the Chinese runs only.
//
// str1    the line to split (taken by value; the caller's string is untouched)
// mylist  emptied first, then filled with one Word per run; Word::character is
//         the run's text and Word::flag is judgeStringisWord() of that text
//
// Character kinds used while scanning (same numbering as judgeStringisWord):
//   11 two-byte digit, 12 two-byte punctuation, 13 other two-byte character,
//   21 one-byte digit, 22 one-byte 0x21-0x2f / 0x3a-0x7d, 23 other one-byte.
// A new run starts wherever the kind changes, with these exceptions:
//   * a two-byte digit preceded by the two-byte code 0xa3ae or by an ASCII '.'
//     stays in the current run;
//   * two-byte punctuation always starts a new run, except 0xa1aa / 0xa3ad /
//     0xa1a4 directly after two-byte punctuation, and 0xa1a4 / 0xa3ae / 0xa3a5
//     directly after two-byte digits;
//   * a one-byte digit directly after ':' or '.' stays in the current run;
//   * a one-byte ':' or '.' directly after digits stays in the current run.
//
// The Word objects are allocated with new and handed to mylist.addFromTail();
// who releases them is not visible from this function.
//
// Run boundaries are kept in a buffer sized from the input, so a line with
// any number of boundaries is handled. Bytes are read as unsigned values, so
// the scan does not depend on the signedness of plain char.
void SegProgramm::initiaSeg(string str1,List &mylist)
{
	// Two-byte code ranges (exclusive bounds) and individual codes.
	const unsigned int punctLow=0xa1a0,punctUp=0xa3b0;
	const unsigned int punctLow1=0xa3b9,punctUp1=0xa3fe;
	const unsigned int numLow=0xa3af,numUp=0xa3ba;
	const unsigned int point1=0xa1aa,point2=0xa3ad,point3=0xa1a4,point4=0xa3ae,point_per=0xa3a5;

	// One-byte ranges.
	const char ASCIInumLow=0x30,ASCIInumUp=0x39;
	const char ASCIIpunctLow=0x21,ASCIIpunctUp=0x2f;
	const char ASCIIpunctLow1=0x3a,ASCIIpunctUp1=0x7e;

	const char *temp=str1.c_str();
	const string::size_type len=str1.length();

	// cutBefore[p] != 0 means "a run starts at byte offset p".
	// One slot per byte plus one for the end of the text.
	string cutBefore(len+1,'\0');

	int flag=1,lflag=1;	// kind of the current / previous character
	for(string::size_type i=0;i<len;)
	{
		const unsigned int lead=static_cast<unsigned char>(temp[i]);
		string::size_type width=1;

		if(lead>=0x80)	// two-byte character
		{
			// temp[i+1] is readable even for a truncated last character:
			// c_str() guarantees a terminating NUL.
			const unsigned int tmp=(lead<<8)|static_cast<unsigned char>(temp[i+1]);
			if(numLow<tmp&&tmp<numUp)
			{
				flag=11;
				if(i>1)
				{
					const unsigned int prev=(static_cast<unsigned int>(static_cast<unsigned char>(temp[i-2]))<<8)
						|static_cast<unsigned char>(temp[i-1]);
					if(prev==point4||temp[i-1]=='.')
						lflag=flag;	// digit after a decimal point: no cut
				}
			}
			else if((punctLow<tmp&&tmp<punctUp)||(punctLow1<tmp&&tmp<punctUp1))
			{
				// Punctuation is normally split off on its own.
				flag=12;
				if((lflag==12)&&((tmp==point1)||(tmp==point2)||(tmp==point3)))
					lflag=flag;
				else if((lflag==11)&&((tmp==point3)||(tmp==point4)||(tmp==point_per)))
					lflag=flag;
				else
					lflag=flag+1;	// any value different from flag forces a cut
			}
			else
			{
				flag=13;
			}
			width=2;
		}
		else	// one-byte character
		{
			const char ch=temp[i];
			if(ASCIInumLow<=ch&&ch<=ASCIInumUp)
			{
				flag=21;
				// lflag==22 implies a previous byte exists, so temp[i-1] is valid.
				if(lflag==22&&((temp[i-1]==0x3a)||(temp[i-1]=='.')))
					lflag=flag;
			}
			// NOTE(review): the upper bound is exclusive here ('~', 0x7e, is not
			// included) whereas judgeStringisWord includes it - confirm intent.
			else if((ASCIIpunctLow<=ch&&ch<=ASCIIpunctUp)||(ASCIIpunctLow1<=ch&&ch<ASCIIpunctUp1))
			{
				flag=22;
				if((lflag==21||lflag==11)&&((ch==0x3a)||(ch=='.')))
					lflag=flag;
			}
			else
			{
				// Reached for bytes 0x00-0x20, 0x7e and 0x7f.
				flag=23;
			}
		}

		if(lflag!=flag)
			cutBefore[i]=1;
		lflag=flag;
		i+=width;
	}

	mylist.emptyList();

	// The end of the text closes the last run.
	cutBefore[len]=1;
	string::size_type runStart=0;
	for(string::size_type pos=1;pos<=len;pos++)
	{
		if(cutBefore[pos]==0)
			continue;
		Word *nod=new Word;
		nod->character=str1.substr(runStart,pos-runStart);
		nod->flag=judgeStringisWord(nod->character);
		mylist.addFromTail(nod);
		runStart=pos;
	}
}


// Segments the text file `filename` line by line. This is the driver that
// calls the other routines; the output file is written here as well.
//
// For every non-empty input line the segmented line is printed to stdout
// ("Seg result:...") and written to "result1.txt". Each line is first split
// into runs by initiaSeg(); Chinese runs are segmented both by forward and by
// reverse maximum matching, and when the two disagree judge() picks one of
// them (1 = forward result, anything else = reverse result).
//
// If the input file cannot be opened a message is printed and the function
// returns without segmenting anything. If the output file cannot be opened a
// message is printed and processing continues with stdout output only.
//
// NOTE(review): the dictionary is built on a local SegProgramm instance
// (myseg), not on *this - confirm that this is intended.
// NOTE(review): the Word pointers returned by mylist.getFromHead() are never
// released here - confirm whether List keeps ownership of them.
void SegProgramm::textToDict(string filename)
{
	ifstream infile(filename.c_str());
	ofstream outfile("result1.txt");
	if(!infile)
	{
		cout<<"cannot open input file: "<<filename<<endl;
		return;
	}
	if(!outfile)
	{
		cout<<"cannot open output file: result1.txt"<<endl;
	}

	List mylist;
	SegProgramm myseg;
	myseg.constructDictionary();

	string line;
	string result_last;
	// Kind of the previously handled run (digit / punctuation / Chinese).
	// NOTE(review): it is not reset at the start of a line, so the first run of
	// a line is joined according to the last run of the previous line.
	int lflag=11;
	while(getline(infile,line))
	{
		if(line.length()==0)
			continue;
		result_last="";
		this->initiaSeg(line,mylist);	// split the line into Chinese / digit / punctuation runs

		while(!mylist.isempty())
		{
			Word *temp=mylist.getFromHead();
			const bool afterNumber=(lflag==11)||(lflag==21);
			const bool afterPunct=(lflag==12)||(lflag==22);

			if((temp->flag==13)||(temp->flag==23))	// Chinese text (or other one-byte text)
			{
				// Forward first, then reverse maximum matching.
				string tmp1=myseg.segSentenceForward(temp->character);
				string tmp2=myseg.segSentenceReverse(temp->character);

				// Identical results: either will do, the forward one is used.
				bool useForward=true;
				if(tmp1.compare(tmp2)!=0)
				{
					// Report where the two differ and let the weighting decide.
					string result=findDifference(tmp1,tmp2);
					cout<<"Difference:"<<result<<endl;
					StringTokenizer diff(result,"|");	// forward part | reverse part
					useForward=(myseg.judge(diff.getToken(0),diff.getToken(1))==1);
				}

				// After a number, a run that begins with a unit word (year, month,
				// day, ...) stays attached to the number; shouldAddSeg() says
				// whether a separator is needed.
				if(afterNumber&&shouldAddSeg(temp->character))
					result_last+="/";
				result_last+=useForward?tmp1:tmp2;
			}
			else if((temp->flag==11)||(temp->flag==21))	// digits
			{
				// Directly after punctuation no extra separator is written.
				if(!afterPunct)
					result_last+="/";
				result_last+=temp->character;
			}
			else	// punctuation
			{
				// Always followed by a separator; preceded by one unless the
				// previous run was punctuation too.
				if(!afterPunct)
					result_last+="/";
				result_last+=temp->character;
				result_last+="/";
			}
			lflag=temp->flag;	// remember the kind of the run just handled
		}

		cout<<"Seg result:"<<result_last<<endl;
		outfile<<result_last<<endl;
	}

	infile.close();
	outfile.close();
//	myseg.printForwardDictionary();
// 	myseg.printReverseDictionary();
}





// Finds where two segmentations of the same text disagree.
//
// str1, str2  segmentation results whose words are separated by "/"
//
// The two token lists are compared position by position over the tokens of
// str1. Every position where the tokens differ contributes "/<token>" to the
// part of its own side.
//
// Returns "<differing tokens of str1>|<differing tokens of str2>". The two
// parts are separated by "|", which the caller uses to split them again.
//
// When str2 has fewer tokens than str1, the missing tokens are treated as
// empty strings instead of indexing past the end of the second token list.
// Tokens of str2 beyond the length of str1 are not examined.
string findDifference(string str1,string str2)
{
	StringTokenizer token1(str1,"/");
	StringTokenizer token2(str2,"/");
	const int len1=token1.getSize();
	const int len2=token2.getSize();

	string result1,result2;
	for(int i=0;i<len1;i++)
	{
		const string left=token1.getToken(i);
		string right;
		if(i<len2)
			right=token2.getToken(i);

		if(left.compare(right)!=0)
		{
			result1+="/";
			result1+=left;
			result2+="/";
			result2+=right;
		}
	}

	return result1+"|"+result2;
}




// True when `text` starts with the complete byte sequence of `prefix`.
static bool beginsWithWord(const std::string &text,const std::string &prefix)
{
	return text.compare(0,prefix.length(),prefix)==0;
}

// Decides whether a separator should be written between a number and the
// text run `character` that follows it.
//
// Returns false when the run begins with one of the unit characters
// 年 月 日 时 分 点 (they stay attached to the preceding number), unless the run
// begins with one of the ordinary words 年份 月份 时间 年中, for which - as for
// every other run - true is returned. ("年来" is deliberately not in the
// exception list; it was commented out in the original.)
//
// Prefixes are matched by the actual byte length of each literal rather than
// by fixed 2- and 4-byte slices, so the result is the same as before for
// GB2312 and stays correct when the source and the input use an encoding with
// a different character width.
bool shouldAddSeg(std::string character)
{
	static const char *const ordinaryWords[]={"年份","月份","时间","年中"};
	static const char *const unitWords[]={"年","月","日","时","分","点"};

	// The longer ordinary words take precedence over the unit characters.
	for(unsigned int k=0;k<sizeof(ordinaryWords)/sizeof(ordinaryWords[0]);k++)
	{
		if(beginsWithWord(character,ordinaryWords[k]))
			return true;
	}
	for(unsigned int k=0;k<sizeof(unitWords)/sizeof(unitWords[0]);k++)
	{
		if(beginsWithWord(character,unitWords[k]))
			return false;
	}
	return true;
}
上一页 12
💿 文件大小 3543 K
👤 上传用户 SLing2008
📂 所属分类：多国语言处理
🏷️ 相关标签

#分割 #程序
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -