📄 hmm_old.cpp

📁 一个用于词性标注的 HMM程序。包含训练和测试功能。
💻 CPP
字号:
////////////////////////////////////////////////
#include<stdio.h>
#include<iostream>
#include<fstream>
#include<string>
#include<time.h>
using namespace std;


typedef struct c
{
	int tag;
	int count;//存放(wl,tj)或(ti,tj)出现的次数
	struct c *next;
}Count;

typedef struct wt
{
	Count *link;
}Word;

//存放count(wl,tj) 
Word WordTag[100000];

typedef struct tt
{
	Count *link;
	int count;  //存放tj出现大的次数
}Tag;

////存放count(ti.tj)
Tag TagTag[600];

//存放word种类，下标对应其在Word_Tag中的位置
struct wc
{
	string str;  
	int index;
}WordClass[100000]; 
int slen=0;

//存放tag种类，下标对应其在Tag_Tag中的位置
struct tc
{
	string str;  
	int index;
}TagClass[600];
int tlen=0;


void initial()
{
	int i=0;
	while(i<100000)
	{
		WordTag[i].link=NULL;
		if(i<600)
		{
			TagTag[i].count=0;
			TagTag[i].link=NULL;
		}
		i++;
	}
}


/////////////////////////////////////////////////////////////////
////////                                 训练部分
////////
////////////////////////////////////////////////////////////////

/////对字符串编码，采用折半插入法
int GetWordIndex(string str)
{
	int i=1,low,high,mid;
	if(str[0]<='Z'&&str[0]>='A')
		str[0]=str[0]+32;
	low=1;
	high=slen;
	string s1;
	while(low<=high)
	{
		mid=(low+high)/2;
		s1=WordClass[mid].str;
	 	if (s1.compare(str)==0) 
			return WordClass[mid].index;
		if (s1.compare(str)>0)
			high=mid-1;
		else 
			low=mid+1;
	}
	for(i=slen;i>=low;--i)
	{
		WordClass[i+1].str=WordClass[i].str;
		WordClass[i+1].index=WordClass[i].index;
	}
	WordClass[low].str=str;
	slen++;
	WordClass[low].index=slen;	 
	return slen;
}
 
/////////////对标记编码，采用折半插入法
int GetTagIndex(string str)
{
	int i=1,low,high,mid;
	low=1;
	high=tlen;
	string s1;
	while(low<=high)
	{
		mid=(low+high)/2;
		s1=TagClass[mid].str;
	 	if (s1.compare(str)==0) 
			return TagClass[mid].index;
		if (s1.compare(str)>0)
			high=mid-1;
		else 
			low=mid+1;
	}
	for(i=tlen;i>=low;--i)
	{
		TagClass[i+1].str=TagClass[i].str;
		TagClass[i+1].index=TagClass[i].index;
	}
	TagClass[low].str=str;
	tlen++;
	TagClass[low].index=tlen;	 
	return tlen;
}

 
///////构建结构体，存放c(w,t)
void Putword(int sindex,int tindex)
{
	Count *p,*q;
	p=WordTag[sindex].link;
	while(p!=NULL)
	{
		if(p->tag==tindex)
		{
			p->count++;
			return;
		}
		q=p;
		p=p->next;
	}
	if(WordTag[sindex].link==NULL)
	{
		WordTag[sindex].link=new Count;
	    p=WordTag[sindex].link;
	}
	else
	{
		p=new Count;
		q->next=p;
	}
	p->next=NULL;
	p->count=1;
	p->tag=tindex;
	return;
}

 
///////////构建结构体，存放c(ti,tj)
void Puttag(int fmr,int curr)
{
	Count *p,*q;
	p=TagTag[fmr].link;
	TagTag[fmr].count++;
	while(p!=NULL)
	{
		if(p->tag==curr)
		{
			p->count++;
			return;
		}
		q=p;
		p=p->next;
	}
	if(TagTag[fmr].link==NULL)
	{
		TagTag[fmr].link=new Count;
	    p=TagTag[fmr].link;
	}
	else
	{
		p=new Count;
		q->next=p;
	}
	p->next=NULL;
	p->count=1;
	p->tag=curr;
	return;
}

 
////////////////训练
void on_training()
{
	int time[]={44,27,17,17,36,48,75,30,0,80,29,24,16,29,0,29};
	int t=0,i=0,j=0,k=0;
	int p,sindex,tindex;
	char filename[11]="train/ca01";
	 
	string str,tag,word;
	int fmr=0,curr=0;
	
	for(t=0;t<16;t++)
	{
		filename[7]='a'+t;		
		for(i=1;i<=time[t];i++)
		{
			
			filename[8]=(i/10)+48;
			filename[9]=(i%10)+48;
			filename[10]='\0';
		 	ifstream infile(filename);
			while(infile>>str)
			{			
				p=str.find('/');
				word=str.substr(0,p);
				tag=str.substr(p+1,str.size());
				sindex=GetWordIndex(word);
				tindex=GetTagIndex(tag);
				Putword(sindex,tindex);
				curr=tindex;
				Puttag(fmr,curr);
				if(tag==".")
					if(word=="!"||word=="?"||word==";")
						infile>>str;
			
			 	if(tag==".")
				{
					fmr=0;
					curr=0;	j++;		
				}
				else 
					fmr=curr;				
			}
		}
	}
}
/////////////////////////////////////////////////////////////////////
 


//////////////////////////////////////////////////////////
///////////                    测试部分
//////////
//////////////////////////////////////////////////////////

///////求p(tk|tj)的概率 ，采用平滑
double PTT(int k,int t)
{
	Count *p;
	p=TagTag[k].link;
	double c,ck=TagTag[k].count;
	while(p!=NULL)
	{
		if(p->tag==t)
		{
			c=p->count;
			return(c/ck);
		}
		p=p->next;
	}
	return(0);
}
 
 
///////////求word的索引号，采用二分法
int IndexWord(string str)
{
	int i=1,low,high,mid;
	if(str[0]<='Z'&&str[0]>='A')
		str[0]=str[0]+32;
	low=1;
	high=slen;
	string s1;
	while(low<=high)
	{
		mid=(low+high)/2;
		s1=WordClass[mid].str;
	 	if (s1.compare(str)==0) 
			return WordClass[mid].index;
		if (s1.compare(str)>0)
			high=mid-1;
		else 
			low=mid+1;
	}
	return -1;
}

///////////////求tag的索引号，采用二分法
int IndexTag(string str)
{
	int i=1,low,high,mid;
	low=1;
	high=tlen;
	string s1;
	while(low<=high)
	{
		mid=(low+high)/2;
		s1=TagClass[mid].str;
	 	if (s1.compare(str)==0) 
			return TagClass[mid].index;
		if (s1.compare(str)>0)
			high=mid-1;
		else 
			low=mid+1;
	}
	return -1;
}

 


//////////测试
void test()
{
	int i=1,j=0,k=0,len=0,file=0,correct=0;
	int row=0;
	int totalword=0,totalcorrect=0;
	int wordcode[400],tagcode[400];
	double largeend1[600]={0},largeend2[600]={0},max;
	 
	int path[400][600],state[600];
	int w,t;
    largeend1[0]=1.0;
	string str,str1,str2;
	char filename[11]="test/cr01";	
	double pwt,ptt;
	ofstream outfile("result/result.txt");
  
	for(file=1;file<=9;file++)
	{
		row=0;
		//filename[2]=(i/10)+48;
		filename[8]=(file%10)+48;
		filename[9]='\0';
	    ifstream infile(filename);
 
		while(infile>>str)
		{   //分割词与标记
			j=str.find('/');
			str1=str.substr(0,j);
			str2=str.substr(j+1,str.size());
			wordcode[i]=IndexWord(str1);//求该词的索引号
			tagcode[i]=IndexTag(str2);//求tag的索引号
		 
			 
			if(str2==".")
				if(str1=="!"||str1=="?"||str1==";")
				{
					infile>>str; 					 
				}
			
			if(str2==".")/////////////////测试算法
			{
				len=i;
				for(j=1;j<=len;j++)
				{
					w=wordcode[j];
			 
					if(w==-1)//这个词在语料库中未出现过的单词
					{   
						pwt=0.0000001;//给一个平滑值
						for(t=1;t<=tlen;t++)///对所有标记					
						{								 
							for(k=0;k<=tlen;k++)//求最大
							{
								ptt=PTT(k,t);							 
								if(largeend2[t]<largeend1[k]*pwt*ptt)
								{
									largeend2[t]=largeend1[k]*pwt*ptt;
									path[j][t]=k;
								}
							}						 
						}
					}
					else//在预料库中出现的单词
					{
						Count *p=WordTag[w].link;
						while(p!=NULL)
						{
							t=p->tag;
							double cwt=p->count;//求count(wj,tj);
							double ct=TagTag[t].count;//求count(tj);
							pwt=cwt/ct;
							for(k=0;k<=tlen;k++)//求最大
							{
								ptt=PTT(k,t);							 
								if(largeend2[t]<largeend1[k]*pwt*ptt)
								{
									largeend2[t]=largeend1[k]*pwt*ptt;
									path[j][t]=k;
								}
							}
							p=p->next;
						}
					}
					for(t=0;t<600;t++)
					{
						largeend1[t]=largeend2[t];
						largeend2[t]=0;
					}
				}
				max=largeend1[1];state[len]=1;
				/////////////////X[n+1]
				for(t=1;t<=tlen;t++)
				{
					if(max<largeend1[t])
					{
						max=largeend1[t];
						state[len]=t;
					}
				}
				/////////////////////////////概率最大路经
				for(t=len-1;t>=1;t--)
					state[t]=path[t+1][state[t+1]];

				for(t=1;t<=len;t++)///////////////////正确率
				{ 
					if(tagcode[t]==state[t])
						correct++;
				}			 
				totalword+=len;
				row++;
				totalcorrect+=correct;				
				outfile<<filename<<": Sentence "<<row<<":   "<<correct<<":"<<len<<"\n";
				i=1;len=0;correct=0;
				largeend1[0]=1.0;			 
			}
			else 
				i++;
		}
	}
	outfile<<"Sum=>>"<<" sum correct:  "<<totalcorrect<<"sum words:  "<<totalword<<"\n";

} 

void main()
{
	time_t s,e; 
	initial();
	time(&s);
	cout<<ctime(&s)<<"\n";
	cout<<"Begining to train the HMM,please wait..........\n";
	on_training();	 
	time(&e);
	cout<<ctime(&e)<<"\n";
	cout<<"End training,Begining to test the file\n";
	test();
	time(&s);
	cout<<ctime(&s)<<"\n"; 
	cout<<"program end,please look at the file : result.txt!\n";  
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -