⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 wjc.cpp

📁 通过隐马尔可夫模型做词性标注,即给定一个句子,得到它的最优词性序列
💻 CPP
字号:
#pragma   warning(disable:   4786)
#include <iostream>
#include <fstream>
#include <cstdlib>
#include <vector>
#include <string>
#include <map>
using namespace std;
// Global tables shared by the loading, training and tagging functions.
map<string,int> word_index;   // dictionary word -> row index in the emission table
map<string,int> pos_index;    // tag -> index in the tag tables
vector<string> dict_word;     // words read from the dictionary
vector<string> test_word;     // words of the sentence currently being tagged
vector<string> test_pos;      // reference tags of the sentence currently being tagged
vector<string> dict_pos;      // every distinct tag found in the dictionary
int dict_word_size,dict_pos_size;   // sizes of word_index / pos_index after loading

// Smoothing weight used by pinghua().  This used to be a macro whose
// replacement text ended in ';', which made it unusable inside expressions.
const double ping_hua=0.1;
// Pairs an integer with a float.  Not referenced anywhere in the visible
// part of this file.
// NOTE(review): judging by the names, presumably a tag index and the
// probability attached to it -- confirm before relying on that.
struct pos_node
{
	int pos_num;     // presumably a tag index (see note above)
	float pos_pro;   // presumably that tag's probability (see note above)
};

/*
 * Load "dict.txt" and build the global word and tag tables.
 *
 * Every line is expected to hold a word, a TAB, and then that word's tags
 * separated by blanks or tabs.  A tag is cut to its first two characters,
 * which is the width the corpus and test readers extract after the '/'.
 *
 * On return:
 *   word_index / dict_word : each distinct word and its 0-based index
 *   pos_index  / dict_pos  : each distinct tag and its 0-based index
 *   dict_word_size, dict_pos_size : number of entries in the two maps
 * The tag table is echoed to stdout.  The program exits with status 100
 * when the file cannot be opened.
 */
void read_dict()
{
	ifstream fin("dict.txt");
	if(!fin)
	{
		cerr<<"error 100 opening dict.txt";
		exit(100);
	}

	const string blanks(" \t\r");
	string line;
	while(getline(fin,line))
	{
		// Tolerate CRLF files: a trailing '\r' is never part of a word or tag.
		if(!line.empty()&&line[line.length()-1]=='\r')
			line.erase(line.length()-1);
		if(line.empty())
			continue;

		const string::size_type tab=line.find('\t');
		const string word=(tab==string::npos)?line:line.substr(0,tab);

		// Hand out an index only the first time a word is seen, so indices
		// stay dense and always below dict_word_size (the row count of the
		// emission table allocated in main).
		if(!word.empty()&&!word_index.count(word))
		{
			const int fresh=(int)word_index.size();
			word_index[word]=fresh;
			dict_word.push_back(word);
		}

		if(tab==string::npos)
			continue;   // no tag field on this line

		// Walk the tag field one whole token at a time.  Stepping character
		// by character would also register the tail of a two-letter tag
		// ("ng" -> "g") as a tag of its own.
		string::size_type begin=line.find_first_not_of(blanks,tab+1);
		while(begin!=string::npos)
		{
			string::size_type end=line.find_first_of(blanks,begin);
			if(end==string::npos)
				end=line.length();

			const string::size_type width=(end-begin>2)?2:end-begin;
			const string tag=line.substr(begin,width);
			if(!pos_index.count(tag))
			{
				const int fresh=(int)pos_index.size();
				pos_index[tag]=fresh;
				dict_pos.push_back(tag);
			}
			begin=line.find_first_not_of(blanks,end);
		}
	}

	dict_word_size=word_index.size();
	dict_pos_size=pos_index.size();

	for(vector<string>::size_type i=0;i<dict_pos.size();i++)
		cout<<pos_index[dict_pos[i]]<<"  "<<dict_pos[i]<<endl;

	fin.close();
}

void count_pi_A_B(float **pos_to_pos,float **pos_to_word)
{
	string s1,s2,s_fir_pos,s_sce_pos,s_end_pos,s_word,fir_str,sce_str,end_str,end_pre_pos;
	int k=0,fir_null,sce_null,end_null,fir=1,from,to;

	ifstream fin1("corpus.txt");
	if(!fin1)
	{
		cerr<<"error 100 opening dict.txt";
		exit(100);
	}
	while(getline(fin1,s2))
	{
		if(s2.length()!=0)
		{
			end_null=s2.rfind("  ");
			if(fir!=1)//shangyihang de zuihou yige zhuandao xia yi hang de diyige
			{
					fir_null=s2.find("  "); 
					fir_str=s2.substr(0,fir_null);
					k=fir_str.find_first_of('/');                 
					s_fir_pos=fir_str.substr(k+1,2);
					from=pos_index[s_end_pos];	  
					to=pos_index[s_fir_pos]+1;
					pos_to_pos[from][to]++;    //di yi lie fang pi
			}
			fir_null=s2.find("  "); 
			fir_str=s2.substr(0,fir_null);
			k=fir_str.find_first_of('/'); 
			s_word=fir_str.substr(0,k);   //tiqu  yige ci,  qiu b.........
			s_fir_pos=fir_str.substr(k+1,2); //tiqu chu yige cixing
			from=word_index[s_word];   //xuoying ,dedao b
			to=pos_index[s_fir_pos];
			pos_to_word[from][to]++;

			pos_to_pos[pos_index[s_fir_pos]][0]++;//qiu pi
			while(fir_null!=end_null)
			{
				sce_null=s2.find("  ",fir_null+2);
				sce_str=s2.substr(fir_null+2,sce_null-fir_null-2);   //tiqu dierge chuan

				k=sce_str.find_first_of('/');                 
				s_word=sce_str.substr(0,k);   //tiqu chu yige ci   qiu b.................
				s_sce_pos=sce_str.substr(k+1,2); //tiqu chu yige cixing
				from=word_index[s_word];   //xuoying ,dedao b
				to=pos_index[s_sce_pos];
				pos_to_word[from][to]++;

				from=pos_index[s_fir_pos];
				to=pos_index[s_sce_pos];
				pos_to_pos[from][to+1]++;    //diyilie fang pi,  a
			


				fir_null=sce_null;
				s_fir_pos=s_sce_pos;
			}
			s_end_pos=s_sce_pos;//jilu xia shangyihang zuihou yige cixing 
		}

		fir=0;
	}
}

/*
 * Turn the raw counts gathered by count_pi_A_B into smoothed scores, in place.
 *
 * pi (column 0 of pos_to_pos): ph is added to every count, then the column
 *     is normalised to sum to 1.
 * A  (columns 1..dict_pos_size): each row becomes (1-ph)*relative_freq + ph.
 *     The rows are therefore scores, not a normalised distribution.
 * B  (pos_to_word): (count+1)/(c(word)+size), where size is the number of
 *     tags observed with the word.
 *
 * ph is the smoothing weight ping_hua.
 *
 * Rows with no observations used to divide by zero, giving NaN in A and inf
 * in B.  A tag with no outgoing transitions now gets the floor value ph
 * everywhere.  A dictionary word absent from the corpus gets the neutral
 * factor 1 for every tag, which is how viterbi scores a word missing from
 * the dictionary.
 */
void pinghua(float **pos_to_pos,float **pos_to_word)
{
	int i,j;
	const float ph=(float)ping_hua;

	// pi: additive smoothing followed by normalisation.  The total is
	// positive whenever there is at least one tag, because ph > 0.
	float all_pi=0;
	for(i=0;i<dict_pos_size;i++)
	{
		pos_to_pos[i][0]+=ph;
		all_pi+=pos_to_pos[i][0];
	}
	for(i=0;i<dict_pos_size;i++)
		pos_to_pos[i][0]/=all_pi;

	// A: interpolate the relative frequency with the constant ph.
	for(i=0;i<dict_pos_size;i++)
	{
		float all_a=0;
		for(j=1;j<=dict_pos_size;j++)
			all_a+=pos_to_pos[i][j];

		for(j=1;j<=dict_pos_size;j++)
		{
			const float rel=(all_a>0)?pos_to_pos[i][j]/all_a:0;
			pos_to_pos[i][j]=(1-ph)*rel+ph;
		}
	}

	// B: add-one smoothing over the tags observed with each word.
	for(j=0;j<dict_word_size;j++)
	{
		float all_b=0;   // c(word[j])
		float size=0;    // number of distinct tags seen with word[j]
		for(i=0;i<dict_pos_size;i++)
		{
			all_b+=pos_to_word[j][i];
			if(pos_to_word[j][i]!=0)
				size++;
		}

		const float denom=all_b+size;
		for(i=0;i<dict_pos_size;i++)
		{
			if(denom>0)
				pos_to_word[j][i]=(float)(pos_to_word[j][i]+1)/denom;
			else
				pos_to_word[j][i]=1.0f;   // word never seen in the corpus
		}
	}
}

/*
 * Best way of reaching tag j from the scores in row i of i_j_pro.
 *
 * Returns the largest i_j_pro[i][k] * pos_to_pos[k][j+1] over all tags k
 * (column j+1 because column 0 of pos_to_pos holds pi) and stores the
 * winning k in rec_k.
 *
 * rec_k is always written.  When no product is greater than 0 the function
 * returns 0 and reports tag 0, rather than leaving the caller with
 * whatever an earlier call happened to store.
 */
float max(float **i_j_pro,float **pos_to_pos,int i,int j,int &rec_k)
{
	float best=0;
	rec_k=0;
	for(vector<string>::size_type k=0;k<dict_pos.size();k++)
	{
		const float candidate=i_j_pro[i][k]*pos_to_pos[k][j+1];
		if(candidate>best)
		{
			best=candidate;
			rec_k=(int)k;
		}
	}
	return best;
}
/*
 * Viterbi decoding of the sentence held in the global test_word.
 *
 * pos_to_pos  : column 0 = pi, columns 1.. = transition scores
 * pos_to_word : emission scores, rows indexed through word_index
 * i_j_pro     : scratch table, test_word.size() rows x dict_pos_size columns;
 *               i_j_pro[i][j] receives the best score of any tag sequence
 *               ending in tag j at word i
 * most_state  : same shape; most_state[i][j] receives the tag that precedes
 *               tag j at word i on that best sequence (row 0 is not written)
 * new_state   : test_word.size() entries; receives the decoded tag sequence
 *
 * A word missing from word_index is treated as equally likely under every
 * tag, so only the transition score is applied.
 *
 * NOTE(review): scores are plain products of float factors; on very long
 * sentences they may underflow -- consider log-space if that shows up.
 */
void viterbi(float **pos_to_pos,float **pos_to_word,float **i_j_pro,string **most_state,string *new_state)
{
	const int words=(int)test_word.size();
	if(words==0)
		return;   // nothing to decode

	int i,j,k,index,rec_k=0;
	float mmax;

	// Initialisation: pi times the emission score of the first word.
	if(word_index.count(test_word[0]))
	{
		index=word_index[test_word[0]];
		for(j=0;j<dict_pos_size;j++)
			i_j_pro[0][j]=pos_to_pos[j][0]*pos_to_word[index][j];
	}
	else
	{
		for(j=0;j<dict_pos_size;j++)
			i_j_pro[0][j]=pos_to_pos[j][0];
	}

	// Recursion: extend the best path into every tag, word by word.
	for(i=1;i<words;i++)
	{
		const bool known=word_index.count(test_word[i])!=0;
		if(known)
			index=word_index[test_word[i]];

		for(j=0;j<dict_pos_size;j++)
		{
			mmax=max(i_j_pro,pos_to_pos,i-1,j,rec_k);
			i_j_pro[i][j]=known?mmax*pos_to_word[index][j]:mmax;
			most_state[i][j]=dict_pos[rec_k];
		}
	}

	// Termination: pick the best final tag.  rec_k must start at 0 here;
	// otherwise, when tag 0 wins, it would still hold the value left by the
	// last call to max().
	const int last=words-1;
	rec_k=0;
	mmax=i_j_pro[last][0];
	for(k=1;k<(int)dict_pos.size();k++)
	{
		if(i_j_pro[last][k]>mmax)
		{
			mmax=i_j_pro[last][k];
			rec_k=k;
		}
	}
	new_state[last]=dict_pos[rec_k];

	// Back-trace through the recorded predecessors.
	for(k=last;k>=1;k--)
	{
		new_state[k-1]=most_state[k][rec_k];
		rec_k=pos_index[new_state[k-1]];
	}
}
void tag_test(float **pos_to_pos,float **pos_to_word)
{
	string s ,s2,one_sentence,fir_str,s_fir_pos,s_sce_pos,s_word,sce_str;
	int end_null,fir_null,sce_null,k,i;
	ifstream fin1("test.txt");
	if(!fin1)
	{
		cerr<<"error 100 opening dict.txt";
		exit(100);
	}

	ofstream fout("result.txt");
	if(!fout)
	{
		cerr<<"error 100 opening result.txt";
		exit(100);
	}
    int line=1;
	float all_cor=0,all=0;
	while(getline(fin1,s))
	{
		if(s.length()!=0)  //tiqu yige juzi de ci he cixing ,bing cunru dongtai shuzu
		{
	        end_null=s.rfind("  ");
			
			fir_null=s.find("  ");  
			fir_str=s.substr(0,fir_null);
			k=fir_str.find_first_of('/'); 
			s_word=fir_str.substr(0,k);   //tiqu  yige ci,  qiu b.........
			s_fir_pos=fir_str.substr(k+1,2); //tiqu chu yige cixing
			test_pos.push_back(s_fir_pos); //cunchu cixing
			test_word.push_back(s_word);         //cunchu ci

			while(fir_null!=end_null)
			{
				sce_null=s.find("  ",fir_null+2);
				sce_str=s.substr(fir_null+2,sce_null-fir_null-2);   //tiqu dierge chuan
				k=sce_str.find_first_of('/');                 
				s_word=sce_str.substr(0,k);   //tiqu chu yige ci   qiu b.................
				s_sce_pos=sce_str.substr(k+1,2); //tiqu chu yige cixing
				test_pos.push_back(s_sce_pos); //cunchu cixing
				test_word.push_back(s_word);         //cunchu ci
				fir_null=sce_null;
			}

			string **most_state=new string* [test_word.size()];
			for(i=0;i<test_word.size();i++)
				most_state[i]=new string [dict_pos_size];

			string *new_state=new string [test_word.size()] ;

			float **i_j_pro=new float* [test_word.size()] ;
			for(i=0;i<test_word.size();i++)
				i_j_pro[i]=new float [dict_pos_size];

			viterbi(pos_to_pos,pos_to_word,i_j_pro,most_state,new_state);  ////diaoyong vertebi

			float num=0.0;		
			float pro;
			for(int i=0;i<test_word.size();i++)
			{
                all++;
				if(test_pos[i]==new_state[i])
				{
					num++;
					all_cor++;
				}
			}

			pro=num/test_word.size();	
			fout<<"line"<<line<<"      "<<num<<"    "<<test_word.size()<<"     "<<pro<<endl;
			line++;
			test_word.clear();   ///qingkong
			test_pos.clear();
		}
	}
	fout<<"the totla pro of correct is "<<all_cor<<"/"<<all<<"="<<all_cor/all<<endl;
	fin1.close();
	fout.close();
}
/*
 * Program entry: load the dictionary, train the HMM on the corpus, smooth
 * it, then tag the test file.
 *
 * Returns 0 on success and 101 when the dictionary yields no words or no
 * tags; the tables would then be empty and every later index into them
 * would be out of bounds.
 */
int main()
{
	read_dict();      // load the dictionary
	if(dict_pos_size<=0||dict_word_size<=0)
	{
		cerr<<"error 101 dict.txt contains no usable entries";
		return 101;
	}

	// Loop counters are declared once here: a name declared in a for-init
	// statement is not visible after that loop in standard C++.
	int row,x;

	// Tag table: column 0 holds pi, columns 1..dict_pos_size hold the transitions.
	float **pos_to_pos=new float* [dict_pos_size];
	for(row=0;row<dict_pos_size;row++)
	{
		pos_to_pos[row]=new float [dict_pos_size+1];
		for(x=0;x<=dict_pos_size;x++)
			pos_to_pos[row][x]=0;
	}

	// Emission table: one row per dictionary word, one column per tag.
	float **pos_to_word=new float* [dict_word_size];
	for(row=0;row<dict_word_size;row++)
	{
		pos_to_word[row]=new float [dict_pos_size];
		for(x=0;x<dict_pos_size;x++)
			pos_to_word[row][x]=0;
	}

	cout<<"GENERATING HMM,WAIT FOR A MOMENT,PLEASE...."<<endl;
	count_pi_A_B(pos_to_pos,pos_to_word);  // collect the HMM counts
	pinghua(pos_to_pos,pos_to_word);       // smoothing
	tag_test(pos_to_pos,pos_to_word);      // tagging

	for(row=0;row<dict_pos_size;row++)
		delete [] pos_to_pos[row];
	delete [] pos_to_pos;
	for(row=0;row<dict_word_size;row++)
		delete [] pos_to_word[row];
	delete [] pos_to_word;

	return 0;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -