📄 mysegprogramm.cpp
字号:
#pragma warning(disable:4786)
#include "MySegProgramm.h"
#include"StringTokenizer.h"
#include "time.h"
#include<iostream>
#include<string>
// conf(w1,w): relative drop from w1 to w, i.e. (w1 - w) / w1, evaluated in
// floating point (the 1.0 factor forces a double division).
// Every use of an argument is parenthesised so that expression arguments such
// as conf(a+b, c+d) expand with the intended precedence.
// w1 is evaluated twice, so arguments must be free of side effects, and
// w1 == 0 is not guarded against.
#define conf(w1,w) (((w1)-(w))*1.0/(w1))
// Forward declarations of free helper functions; their definitions are not
// part of the portion of the file shown here.
string judgeandseg(List &mylist,SegProgramm seg);
void segstring(ifstream &infile,SegProgramm &seg);
void initial(SegProgramm &seg,string filename);
string findDifference(string str1,string str2);// find the differing results produced by segmentation
bool shouldAddSeg(string character);
// Default constructor: makes sure all four word tables start out empty.
SegProgramm::SegProgramm()
{
    this->allsegdic.clear();
    this->rtempdic.clear();
    this->ftempdic.clear();
    this->dictionary.clear();
}
// Rebuilds the main dictionary from the file "dict.txt" (opened relative to
// the current working directory).
//
// Each non-empty line is expected to look like "word/frequency": the text
// before the first '/' becomes the key and the text after it is parsed with
// atoi() as the initial count. A line without any '/' is stored as a word
// with count 0. Blank lines are skipped. When a word occurs again, its stored
// count is incremented by one (the number on the repeated line is not used).
//
// If the file cannot be opened, the dictionary is simply left empty.
void SegProgramm::constructDictionary()
{
    dictionary.clear();

    ifstream infile("dict.txt");// dictionary file
    string line;
    while(getline(infile,line))
    {
        // A blank line carries no entry; do not register an empty word.
        if(line.empty())
            continue;

        // find_first_of yields string::npos (an unsigned value) when there is
        // no separator, so keep the result in size_type and test it
        // explicitly instead of narrowing it to int.
        const string::size_type pos=line.find_first_of("/");
        string word;
        int num=0;
        if(pos==string::npos)
        {
            word=line;
        }
        else
        {
            word=line.substr(0,pos);
            num=atoi(line.substr(pos+1).c_str());
        }

        map<string,int>::iterator iter=dictionary.find(word);
        if(iter!=dictionary.end())
        {
            iter->second++;
        }
        else
        {
            dictionary.insert(pair<string,int>(word,num));
        }
    }
    infile.close();
}
// Segments h by scanning it from the left for dictionary words.
//
// h    : text to segment. Offsets and lengths are always handled in steps of
//        two bytes, so the code presumably expects a double-byte encoding
//        such as GBK -- TODO confirm.
// word : maximum candidate length, counted in 2-byte units.
//
// Returns the pieces of h joined with "/". Each dictionary word matched at
// this level is also counted in ftempdic. An h shorter than two bytes yields
// an empty string, because the clamp below drives word below 1.
string SegProgramm::segSentenceForward(string h,int word)
{
map<string,int>::iterator iter;
int len=h.length();
int flag=0;
string leftString="";
string rightString="";
string result="";
// Clamp the candidate length so that a window of `word` units fits inside h.
while(word*2>len)
{
word--;
}
// Try candidate lengths from the longest down to two units.
while(word>1)
{
// Slide a window of word*2 bytes over h from the left, two bytes per step.
for(int i=0;i<len-word*2+2;i+=2)
{
string temp=h.substr(i,word*2);
if(isInDictionary(temp))
{
flag=1;
// Text before the match: windows of the current length lying wholly inside
// it were already rejected by this loop, so recurse with word-1.
string tmp=h.substr(0,i);
if(tmp!="")
leftString=leftString+segSentenceForward(tmp,word-1)+"/";
// Text after the match has not been examined yet: recurse with the same
// maximum length.
tmp=h.substr(i+word*2,len-i-word*2);
if(tmp!="")
rightString+="/"+segSentenceForward(tmp,word)+rightString;
result=leftString+temp+rightString;
// Record the matched word in the forward table.
iter=ftempdic.find(temp);
if(iter!=ftempdic.end())
iter->second++;
else
{
ftempdic.insert(pair<string,int>(temp,1));
}
// When the first check fails both side strings are non-empty; leftString was
// built ending in "/" and rightString beginning with "/", so the second check
// then succeeds. In effect the function returns on the first match.
if(word==2||leftString==""||rightString=="")
return result;
// find_last_of/find_first_of return string::npos when nothing is found; the
// comparison with -1 relies on -1 converting to that value.
if((leftString.find_last_of("/"))!=-1&&(rightString.find_first_of("/"))!=-1)
return result;
}
}
// No window of this length matched: retry with a shorter candidate.
if(flag==0)
{
word--;
}
}
// Nothing longer than one unit matched: emit h as 2-byte pieces separated by
// "/". word==1 implies len>=2 here because of the clamp at the top.
if(word==1)
{
for(int j=0;j<len-2;j+=2)
{
string temp=h.substr(j,2);
result+=temp+"/";
}
result+=h.substr(len-2,2);
}
return result;
}
// Segments h by scanning it from the right end for dictionary words.
//
// h    : text to segment. Offsets and lengths are always handled in steps of
//        two bytes, so the code presumably expects a double-byte encoding
//        such as GBK -- TODO confirm.
// word : maximum candidate length, counted in 2-byte units.
//
// Returns the pieces of h joined with "/". Unlike the forward variant, the
// bookkeeping of matched words (rtempdic) is commented out here, so this
// function does not modify rtempdic. An h shorter than two bytes yields an
// empty string, because the clamp below drives word below 1.
string SegProgramm::segSentenceReverse(string h,int word)
{
map<string,int>::iterator iter;
int len=h.length();
int flag=0;
string leftString="";
string rightString="";
string result="";
// Clamp the candidate length so that a window of `word` units fits inside h.
while(word*2>len)
{
word--;
}
// Try candidate lengths from the longest down to two units.
while(word>1)
{
// Slide a window of word*2 bytes over h starting at the right end and moving
// left two bytes per step.
for(int i=len-word*2;i>=0;i-=2)
{
string temp=h.substr(i,word*2);
if(isInDictionary(temp))
{
flag=1;
// Text before the match has not been examined yet: recurse with the same
// maximum length.
string tmp=h.substr(0,i);
if(tmp!="")
leftString=leftString+segSentenceReverse(tmp,word)+"/";
// Text after the match: windows of the current length lying wholly inside it
// were already rejected by this loop, so recurse with word-1.
tmp=h.substr(i+word*2,len-i-word*2);
if(tmp!="")
rightString+="/"+segSentenceReverse(tmp,word-1)+rightString;
result=leftString+temp+rightString;
// Disabled: counting of the matched word in the reverse table.
//
// iter=rtempdic.find(temp);
// if(iter!=rtempdic.end())
// iter->second++;
// else
// {
// rtempdic.insert(pair<string,int>(temp,1));
// }
// When the first check fails both side strings are non-empty; leftString was
// built ending in "/" and rightString beginning with "/", so the second check
// then succeeds. In effect the function returns on the first match.
if(word==2||leftString==""||rightString=="")
return result;
// find_last_of/find_first_of return string::npos when nothing is found; the
// comparison with -1 relies on -1 converting to that value.
if((leftString.find_last_of("/"))!=-1&&(rightString.find_first_of("/"))!=-1)
return result;
}
}
// No window of this length matched: retry with a shorter candidate.
if(flag==0)
{
word--;
}
}
// Nothing longer than one unit matched: emit h as 2-byte pieces separated by
// "/". word==1 implies len>=2 here because of the clamp at the top.
if(word==1)
{
for(int j=0;j<len-2;j+=2)
{
string temp=h.substr(j,2);
result+=temp+"/";
}
result+=h.substr(len-2,2);
}
return result;
}
// Looks up `character` in the main dictionary.
// Returns the count stored for it, or 0 when the word is absent.
// If the dictionary holds no entries at all, a message is printed and the
// process is terminated through exit(0).
int SegProgramm::isInDictionary(string character)
{
    if(dictionary.empty())
    {
        cout<<"the dictionary is empty!"<<endl;
        exit(0);
    }

    const map<string,int>::iterator hit=dictionary.find(character);
    return (hit==dictionary.end())?0:hit->second;
}
// Writes every main-dictionary entry to stdout as "word/count", one per line.
// With an empty dictionary it prints a hint and terminates via exit(0).
void SegProgramm::printDictionary()
{
    if(dictionary.empty())
    {
        cout<<"the dictionary is empty,please initial it!"<<endl;
        exit(0);
    }

    map<string,int>::iterator entry=dictionary.begin();
    while(entry!=dictionary.end())
    {
        cout<<entry->first<<"/"<<entry->second<<endl;
        ++entry;
    }
}
// Writes every entry of the forward table (ftempdic) to stdout as
// "word-count", one per line.
// With an empty table it prints a hint and terminates via exit(0).
void SegProgramm::printForwardDictionary()
{
    if(ftempdic.empty())
    {
        cout<<"the dictionary of the forward seg_programm is empty,please call SegSentenceFoward"<<endl;
        exit(0);
    }

    map<string,int>::iterator entry=ftempdic.begin();
    while(entry!=ftempdic.end())
    {
        cout<<entry->first<<"-"<<entry->second<<endl;
        ++entry;
    }
}
// Writes every entry of the reverse table (rtempdic) to stdout as
// "word-count", one per line.
// With an empty table it prints a hint and terminates via exit(0).
void SegProgramm::printReverseDictionary()
{
    if(rtempdic.empty())
    {
        cout<<"the dictionary of the reverse seg_programm is empty,please call segSentenceReverse"<<endl;
        exit(0);
    }

    map<string,int>::iterator entry=rtempdic.begin();
    while(entry!=rtempdic.end())
    {
        cout<<entry->first<<"-"<<entry->second<<endl;
        ++entry;
    }
}
// Returns the count stored for h in the main dictionary, or 0 when h is not
// present. A hit is also echoed to stdout as "word count".
int SegProgramm::getFrequenceFromDictionary(string h)
{
    map<string,int>::iterator found=dictionary.find(h);
    if(found==dictionary.end())
        return 0;

    const int stored=(int)found->second;
    cout<<found->first<<" "<<found->second<<endl;
    return stored;
}
// Returns the count stored for h in the forward table (ftempdic), or 0 when
// h is not present.
int SegProgramm::getFrequenceFromFtempdic(string h)
{
    map<string,int>::iterator found=ftempdic.find(h);
    return (found==ftempdic.end())?0:(int)found->second;
}
// Returns the count stored for h in the reverse table (rtempdic), or 0 when
// h is not present.
int SegProgramm::getFrequenceFromRtempdic(string h)
{
    int count=0;
    map<string,int>::iterator found=rtempdic.find(h);
    if(found!=rtempdic.end())
    {
        count=found->second;
    }
    return count;
}
int SegProgramm::judge(string str1,string str2)
{
StringTokenizer seg1(str1,"/");
StringTokenizer seg2(str2,"/");
int length=0;
int frequence=0;
double weight1=0.0,weight2=0.0;
for(int i=0;i<seg1.getSize();i++)
{
length=seg1.getTokenLength(i);
//frequence=this->getFrequenceFromFtempdic(seg1.getToken(i));
frequence=this->getFrequenceFromDictionary(seg1.getToken(i));
weight1+=length*frequence;
}
for(int j=0;j<seg2.getSize();j++)
{
length=seg2.getTokenLength(j);
//frequence=getFrequenceFromRtempdic(seg2.getToken(j));
frequence=this->getFrequenceFromDictionary(seg2.getToken(j));
weight2+=length*frequence;
}
return weight1>weight2?1:2;
}
void SegProgramm::InitialAllsegdic(string filename)//quan且分的函数
{
ifstream infile(filename.c_str());
List tmplist;
string line="";
while(getline(infile,line))
{
if(line.length()==0)
continue;
this->initiaSeg(line,tmplist);
while(!tmplist.isempty())
{
Word *temp=tmplist.getFromHead();
if((temp->flag==13)||(temp->flag==23))
allSeg(temp->character);
}
}
filter();
// printAllsegDictionary();
truth_filter();
cout<<"-----------------------------------------"<<endl;
printAllsegDictionary();
infile.close();
}
// Prints the full-segmentation table (allsegdic) to stdout as "word/ count",
// one entry per line. An empty table prints nothing.
void SegProgramm::printAllsegDictionary()
{
    map<string,int>::iterator entry=allsegdic.begin();
    while(entry!=allsegdic.end())
    {
        cout<<entry->first<<"/ "<<entry->second<<endl;
        ++entry;
    }
}
// Full segmentation: counts in allsegdic every substring of str that is
// between 2 and `sizeflag` units long, where a unit is two bytes.
//
// str      : text to enumerate substrings of.
// sizeflag : maximum substring length in 2-byte units; it is first reduced
//            until a window of that size fits into str.
//
// The main scan only visits start offsets at which a full-size window still
// fits (plus one further step); the remaining tail of str is handled by a
// recursive call with the maximum length reduced by one.
// Original author's note: this still needs updating so that, for a sentence,
// only the non-punctuation characters are fully segmented.
void SegProgramm::allSeg(string str,int sizeflag)
{
    const int total=str.length();
    while(sizeflag*2>total)
        --sizeflag;

    const int stop=total-sizeflag*2+2;
    int start=0;
    while(start<stop)
    {
        for(int units=2;units<=sizeflag;++units)
        {
            const string piece=str.substr(start,units*2);
            map<string,int>::iterator slot=allsegdic.find(piece);
            if(slot!=allsegdic.end())
                slot->second++;
            else
                allsegdic.insert(pair<string,int>(piece,1));
        }
        start+=2;
    }

    // `start` now sits just past the last scanned offset; the rest of str is
    // shorter than a full window, so enumerate it with a smaller maximum.
    if(sizeflag>2)
        allSeg(str.substr(start,2*sizeflag),sizeflag-1);
}
// Compares w1 against the beginning of w two bytes at a time.
// Returns true when every 2-byte chunk of w1 (the final chunk is a single
// byte if w1 has odd length) equals the chunk of w taken at the same offset.
// An empty w1 therefore yields true.
bool BeContained(string w1,string w)
{
    const int limit=w1.length();
    int offset=0;
    while(offset<limit)
    {
        if(w1.substr(offset,2)!=w.substr(offset,2))
            return false;
        offset+=2;
    }
    return true;
}
void SegProgramm::truth_filter()
{
string w1="",w="";
int fre_w1=0,fre_w=0;
map<string,int>::iterator iter,iter1;
map<string,int> tempdic;
// for(iter=allsegdic.begin();iter!=allsegdic.end();iter++)
// {
// for(iter1=allsegdic.begin();iter1!=allsegdic.end();iter1++)
// {
// if(iter1!=iter)
// {
// if(BeContained(iter->first,iter1->first))
// {
// float con=conf(iter->second,iter1->second);
// if(con<0.2)
// {
// allsegdic.erase(iter1);
// iter1=allsegdic.begin();
// }
// else if(con>0.9)
// {
// allsegdic.erase(iter);
// iter=allsegdic.begin();
// }
// else
// {
// allsegdic.erase(iter);
// allsegdic.erase(iter1);
// iter=iter1=allsegdic.begin();
// }
// }
// }
// }
// }
for(iter=allsegdic.begin();iter!=allsegdic.end();)
{
w1=iter->first;
fre_w1=iter->second;
iter1=++iter;
w="";
if(iter1!=allsegdic.end())
{
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -