⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 mysegprogramm.cpp

📁 一个文本分割程序
💻 CPP
📖 第 1 页 / 共 2 页
字号:
#pragma   warning(disable:4786) 
#include "MySegProgramm.h"
#include"StringTokenizer.h"
#include "time.h"
#include<iostream>
#include<string>
// Relative drop from w1 to w as a floating-point fraction of w1: (w1 - w) / w1.
// Both arguments are parenthesized so compound expressions such as
// conf(a, b+c) expand correctly. w1 is evaluated twice, so do not pass an
// expression with side effects; w1 == 0 yields a floating-point division by zero.
#define conf(w1,w)  (((w1)-(w))*1.0/(w1))
// Forward declarations of free helper functions. Their definitions are not
// in this part of the file, so only the signatures are documented here.
string judgeandseg(List &mylist,SegProgramm seg);
void segstring(ifstream &infile,SegProgramm &seg);
void initial(SegProgramm &seg,string filename);
string findDifference(string str1,string str2);// find the differing results produced by word segmentation
bool shouldAddSeg(string character);
// Default constructor: leaves every word table of the object empty
// (the main dictionary, the forward and reverse tables, and the
// full-segmentation table).
SegProgramm::SegProgramm()
{
	// The clears are independent of one another.
	allsegdic.clear();
	rtempdic.clear();
	ftempdic.clear();
	dictionary.clear();
}
// Rebuilds `dictionary` from the file "dict.txt" in the current directory.
//
// Each non-blank line is read as "word/count": the text before the first '/'
// is the word, and the text after it is parsed with atoi() as its count.
// A line with no '/' is stored as a word with count 0. Blank lines are skipped.
// When a word occurs again, its stored count is incremented by one (the
// count on the repeated line is not added).
//
// If the file cannot be opened, a message is written to cerr and the
// dictionary is left empty.
void SegProgramm::constructDictionary()
{
	dictionary.clear();

	ifstream infile("dict.txt");// dictionary file
	if(!infile)
	{
		cerr<<"cannot open dict.txt"<<endl;
		return;
	}

	string line;
	while(getline(infile,line))
	{
		// A blank line carries no entry; do not create an empty-string key.
		if(line.empty())
			continue;

		string word=line;
		int num=0;
		// Test against npos explicitly instead of relying on npos being
		// narrowed to -1 and widened back again.
		const string::size_type pos=line.find('/');
		if(pos!=string::npos)
		{
			word=line.substr(0,pos);
			num=atoi(line.substr(pos+1).c_str());
		}

		map<string,int>::iterator iter=dictionary.find(word);
		if(iter!=dictionary.end())
			iter->second++;
		else
			dictionary.insert(pair<string,int>(word,num));
	}
	// infile is closed by its destructor.
}


// Forward segmentation of `h`; the pieces of the result are joined with "/".
//
// `word` is the widest window to try, counted in 2-byte units (presumably
// one double-byte character each -- TODO confirm the input encoding).
// Windows are tried from widest to narrowest, and each width is slid from
// the left end in 2-byte steps. On the first dictionary hit, the text before
// the hit is segmented recursively with width word-1 and the text after it
// with width word. The hit is then counted in `ftempdic` and the joined
// result is returned. If no window wider than one unit matches, every
// 2-byte unit becomes its own piece.
//
// Returns "" when `h` is shorter than 2 bytes or `word` is below 1.
string SegProgramm::segSentenceForward(string h,int word)
{
	const int total=h.length();

	// A window of `word` units covers word*2 bytes; shrink it until it fits.
	while(word*2>total)
		--word;

	for(;word>1;--word)
	{
		const int span=word*2;
		for(int start=0;start<total-span+2;start+=2)
		{
			const string candidate=h.substr(start,span);
			if(!isInDictionary(candidate))
				continue;

			// Segment the text on the left of the hit, if there is any.
			string head="";
			const string before=h.substr(0,start);
			if(before!="")
				head=segSentenceForward(before,word-1)+"/";

			// Segment the text on the right of the hit, if there is any.
			string tail="";
			const string after=h.substr(start+span,total-start-span);
			if(after!="")
				tail="/"+segSentenceForward(after,word);

			// Record one more occurrence of the matched word.
			++ftempdic[candidate];

			// A non-empty head always ends with "/" and a non-empty tail
			// always starts with "/", so the first hit is always final.
			return head+candidate+tail;
		}
	}

	string pieces="";
	if(word==1)
	{
		// Nothing wider than one unit matched: split into single units.
		const int last=total-2;
		for(int pos=0;pos<last;pos+=2)
			pieces+=h.substr(pos,2)+"/";
		pieces+=h.substr(last,2);
	}

	return pieces;
}

// Reverse segmentation of `h`; the pieces of the result are joined with "/".
//
// `word` is the widest window to try, counted in 2-byte units (presumably
// one double-byte character each -- TODO confirm the input encoding).
// Windows are tried from widest to narrowest, and each width is slid from
// the right end towards the left in 2-byte steps. On the first dictionary
// hit, the text before the hit is segmented recursively with width word and
// the text after it with width word-1, and the joined result is returned.
// If no window wider than one unit matches, every 2-byte unit becomes its
// own piece.
//
// Unlike the forward variant, hits are not counted: the code that updated
// `rtempdic` here was commented out.
//
// Returns "" when `h` is shorter than 2 bytes or `word` is below 1.
string SegProgramm::segSentenceReverse(string h,int word)
{
	const int total=h.length();

	// A window of `word` units covers word*2 bytes; shrink it until it fits.
	while(word*2>total)
		--word;

	for(;word>1;--word)
	{
		const int span=word*2;
		for(int start=total-span;start>=0;start-=2)
		{
			const string candidate=h.substr(start,span);
			if(!isInDictionary(candidate))
				continue;

			// Segment the text on the left of the hit, if there is any.
			string head="";
			const string before=h.substr(0,start);
			if(before!="")
				head=segSentenceReverse(before,word)+"/";

			// Segment the text on the right of the hit, if there is any.
			string tail="";
			const string after=h.substr(start+span,total-start-span);
			if(after!="")
				tail="/"+segSentenceReverse(after,word-1);

			// A non-empty head always ends with "/" and a non-empty tail
			// always starts with "/", so the first hit is always final.
			return head+candidate+tail;
		}
	}

	string pieces="";
	if(word==1)
	{
		// Nothing wider than one unit matched: split into single units.
		const int last=total-2;
		for(int pos=0;pos<last;pos+=2)
			pieces+=h.substr(pos,2)+"/";
		pieces+=h.substr(last,2);
	}

	return pieces;
}

// Looks `character` up in `dictionary`.
// Returns the stored count for the word, or 0 when the word is absent
// (a word stored with count 0 is therefore reported as absent too).
// If the dictionary holds no entries at all, prints a message and
// terminates the program with exit(0).
int SegProgramm::isInDictionary(string character)
{
	if(dictionary.empty())
	{
		cout<<"the dictionary is empty!"<<endl;
		exit(0);
	}

	map<string,int>::iterator hit=dictionary.find(character);
	return hit==dictionary.end()?0:hit->second;
}

// Writes every entry of `dictionary` to cout as "word/count", one per line.
// If the dictionary is empty, prints a message and terminates the program
// with exit(0).
void SegProgramm::printDictionary()
{
	if(dictionary.empty())
	{
		cout<<"the dictionary is empty,please initial it!"<<endl;
		exit(0);
	}

	map<string,int>::iterator entry=dictionary.begin();
	while(entry!=dictionary.end())
	{
		cout<<entry->first<<"/"<<entry->second<<endl;
		++entry;
	}
}

// Writes every entry of `ftempdic` (the words counted by forward
// segmentation) to cout as "word-count", one per line.
// If the table is empty, prints a message and terminates the program
// with exit(0).
void SegProgramm::printForwardDictionary()
{
	if(ftempdic.empty())
	{
		cout<<"the dictionary of the forward seg_programm is empty,please call SegSentenceFoward"<<endl;
		exit(0);
	}

	map<string,int>::iterator entry=ftempdic.begin();
	while(entry!=ftempdic.end())
	{
		cout<<entry->first<<"-"<<entry->second<<endl;
		++entry;
	}
}

// Writes every entry of `rtempdic` to cout as "word-count", one per line.
// If the table is empty, prints a message and terminates the program
// with exit(0).
void SegProgramm::printReverseDictionary()
{
	if(rtempdic.empty())
	{
		cout<<"the dictionary of the reverse seg_programm is empty,please call segSentenceReverse"<<endl;
		exit(0);
	}

	map<string,int>::iterator entry=rtempdic.begin();
	while(entry!=rtempdic.end())
	{
		cout<<entry->first<<"-"<<entry->second<<endl;
		++entry;
	}
}

// Returns the count stored for `h` in `dictionary`, or 0 when `h` is absent.
// Side effect: when the word is found, it is also written to cout together
// with its count.
int SegProgramm::getFrequenceFromDictionary(string h)
{
	map<string,int>::iterator hit=dictionary.find(h);
	if(hit==dictionary.end())
		return 0;

	cout<<hit->first<<"  "<<hit->second<<endl;
	return hit->second;
}

// Returns the count stored for `h` in `ftempdic`, or 0 when `h` is absent.
int SegProgramm::getFrequenceFromFtempdic(string h)
{
	map<string,int>::iterator hit=ftempdic.find(h);
	return hit==ftempdic.end()?0:hit->second;
}

// Returns the count stored for `h` in `rtempdic`, or 0 when `h` is absent.
int SegProgramm::getFrequenceFromRtempdic(string h)
{
	map<string,int>::iterator hit=rtempdic.find(h);
	if(hit==rtempdic.end())
		return 0;
	return hit->second;
}

int SegProgramm::judge(string str1,string str2)
{
	StringTokenizer seg1(str1,"/");
	StringTokenizer seg2(str2,"/");
	
	int length=0;
	int frequence=0;

	double weight1=0.0,weight2=0.0;
	for(int i=0;i<seg1.getSize();i++)
	{
		length=seg1.getTokenLength(i);
		//frequence=this->getFrequenceFromFtempdic(seg1.getToken(i));
		frequence=this->getFrequenceFromDictionary(seg1.getToken(i));
		weight1+=length*frequence;
	}
	for(int j=0;j<seg2.getSize();j++)
	{
		length=seg2.getTokenLength(j);
		//frequence=getFrequenceFromRtempdic(seg2.getToken(j));
		frequence=this->getFrequenceFromDictionary(seg2.getToken(j));
		weight2+=length*frequence;
	}
	
	return weight1>weight2?1:2;
}
void SegProgramm::InitialAllsegdic(string filename)//quan且分的函数 
{
	ifstream infile(filename.c_str());
	
	List	tmplist;

	string line="";
	while(getline(infile,line))
	{
		if(line.length()==0)
			continue;
		this->initiaSeg(line,tmplist);
		while(!tmplist.isempty())
		{
			Word *temp=tmplist.getFromHead();
			if((temp->flag==13)||(temp->flag==23))
				allSeg(temp->character);
		}
		
	}
	
	filter();
//	printAllsegDictionary();
	truth_filter();
	cout<<"-----------------------------------------"<<endl;
	printAllsegDictionary();
	infile.close();
}

void SegProgramm::printAllsegDictionary()// print the full-segmentation table
{
	// One line per entry of `allsegdic`, written to cout as "word/ count".
	map<string,int>::iterator entry=allsegdic.begin();
	while(entry!=allsegdic.end())
	{
		cout<<entry->first<<"/ "<<entry->second<<endl;
		++entry;
	}
}
void SegProgramm::allSeg(string str,int sizeflag)// full segmentation; TODO from the original author: within a sentence, fully segment only the non-punctuation text
{
	// Counts substrings of `str` in `allsegdic`.
	// `sizeflag` is the longest substring to record, in 2-byte units; it is
	// first shrunk until sizeflag*2 bytes fit into `str`. For every 2-byte
	// offset at which a full-width window still fits, the substrings of
	// 2..sizeflag units starting there are counted. The remaining tail is
	// then processed recursively with sizeflag-1, while sizeflag exceeds 2.
	const int total=str.length();
	while(sizeflag*2>total)
		--sizeflag;

	int start=0;
	while(start<total-sizeflag*2+2)
	{
		for(int units=2;units<=sizeflag;++units)
			++allsegdic[str.substr(start,units*2)];
		start+=2;
	}

	// `start` now sits just past the last full-width window.
	if(sizeflag>2)
		allSeg(str.substr(start,2*sizeflag),sizeflag-1);
}

// Compares w1 against the start of w in 2-byte chunks.
// Returns true when every chunk of w1 equals the chunk of w at the same
// offset, and false at the first chunk that differs. An empty w1 always
// yields true. For an even-length w1 this means "w starts with w1".
// For an odd-length w1 the final one-byte chunk must equal the whole
// remaining chunk of w, so the result is true only when w equals w1.
bool BeContained(string w1,string w)
{
	const int limit=w1.length();
	int offset=0;
	while(offset<limit)
	{
		if(w1.substr(offset,2)!=w.substr(offset,2))
			return false;
		offset+=2;
	}
	return true;
}

void SegProgramm::truth_filter()
{
	string w1="",w="";
	int fre_w1=0,fre_w=0;
	map<string,int>::iterator iter,iter1;
	map<string,int> tempdic;
//	for(iter=allsegdic.begin();iter!=allsegdic.end();iter++)
//	{
//		for(iter1=allsegdic.begin();iter1!=allsegdic.end();iter1++)
//		{
//			if(iter1!=iter)
//			{
//				if(BeContained(iter->first,iter1->first))
//				{
//					float con=conf(iter->second,iter1->second);
//					if(con<0.2)
//					{
//						allsegdic.erase(iter1);
//						iter1=allsegdic.begin();
//					}
//					else if(con>0.9)
//					{
//						allsegdic.erase(iter);
//						iter=allsegdic.begin();
//					}
//					else
//					{
//						allsegdic.erase(iter);
//						allsegdic.erase(iter1);
//						iter=iter1=allsegdic.begin();
//					}
//				}
//			}
//		}
// 	}
	for(iter=allsegdic.begin();iter!=allsegdic.end();)
	{
		w1=iter->first;
		fre_w1=iter->second;
		iter1=++iter;
		w="";
		if(iter1!=allsegdic.end())
		{

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -