method.cpp

来自「利用贝叶斯分类原理实现多义词的消歧。首先利用训练语料进行训练」· C++ 代码 · 共 313 行
CPP
313 行
// Method.cpp: implementation of the CMethod class.
//
//////////////////////////////////////////////////////////////////////

#include "stdafx.h"
#include <io.h>
#include <stdio.h>
//#include "DMBGT.h"
#include "Method.h"
#include <algorithm>
#define LAM 0.000001  // smoothing parameter (added to the sense counts when CountPara() computes Psk)
#define LAMDA 0.1     // smoothing parameter (used in the P_LAM and Pvs formulas)
#define NN 53335      // total number of words in the vocabulary

#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__;
#define new DEBUG_NEW
#endif

//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////

// Constructs a CMethod.
// Seeds strTmp with a single space; ReadSenseNo() compares every word name it
// reads against strTmp to decide when a new word ID has to be started.
CMethod::CMethod()
{
	strTmp=" ";
//	ReadSenseNo();   // disabled here: ReadSentence() calls ReadSenseNo() itself
}

// Destructor. Its only action is to show a message box saying that training
// is over, so the box appears whenever a CMethod object is destroyed.
// NOTE(review): nothing is released here; the CContext nodes allocated with
// `new` in CreateTree() are not deleted anywhere in the visible code --
// confirm whether that is intentional.
CMethod::~CMethod()
{
	AfxMessageBox("train is over!");
}

void CMethod::ReadSenseNo()
{
	fstream finSen("Cup-Dic简表.txt",ios::in);
	string  str_Name;
	string  str_gender;
	string  SenseID;
	string  p;
	int     Count = 0;
	
	while(finSen>>str_Name>>str_gender>>SenseID>>p)
	{
		WordList.word.wordName = str_Name;
		WordList.word.senseID  = SenseID;
		if(str_Name != strTmp)
		{
			Count++;
			WordList.wordID = Count;
		}
		else
		{
			WordList.wordID = Count;
		}
		strTmp = str_Name;	
		WordList.wordCount = 0;
		WordList.Context_T = 0;
	    WordList.Psk       = 0;
		WordList.P_LAM     = 0;
		vectWordList.push_back(WordList);//把词写入词表
	}
	sort(vectWordList.begin(),vectWordList.end());
	

}

void CMethod::ReadSentence()
{
	fstream                 finSentence("hebing-修改.txt",ios::in);
	string                  sentence;
	vector<CWord>::iterator pPointer;
	CWord                   ToSearch;	
	ReadSenseNo();
	vector<int>             vectTmp; 
	PPP                     p;
	PPP                     pBefore;
	PPP                     pAfter;
	while(getline(finSentence, sentence))
	{
		                                //保存一个句子的所有词ID
		SplitLine(sentence);
		int i, j = vectword.size();
		for(i = 0; i < j; i ++)  //查出词汇的ID号码，并统计词频记录在表中
		{
			if(vectword[i].senseID == "-1")
			{
				vectWorID.push_back(-1);
				pPointer = 0;
				vectMid.push_back(pPointer); 

			}
			else
			{
				ToSearch.word = vectword[i];
				pPointer = lower_bound(vectWordList.begin(),vectWordList.end(),ToSearch);
				if(pPointer != vectWordList.end()&&pPointer->word == vectword[i])
				{
					pPointer->wordCount++;
					vectWorID.push_back(pPointer->wordID);                     //记录每个词的ID
					vectMid.push_back(pPointer);                //记录每个词在向量中的位置
				}
				else
				{
					pPointer = 0;
					vectWorID.push_back(-1);
				    vectMid.push_back(pPointer); 
				}                                       //找不到的词汇不处理
			}
			
		}

		int a, b = vectword.size();	//	
		for(a = 0; a < b; a++)
		{
			p = vectMid[a];
			
			if(p > 0)
			{
				pBefore = p - 1;
				pAfter  = p + 1;
				if(!(pBefore->wordID != p->wordID && pAfter->wordID != p->wordID))  //如果无歧义，不考虑上下文
				{		
					vectTmp = vectWorID;                              //保存一个句子的所有词ID
	    			vectTmp.erase(vectTmp.begin()+a);                 //保存上下文词的ID
					int h, c = vectTmp.size();
					for(h = 0; h < c; h++)
					{			
						CreateTree(p->Context_T,vectTmp[h]);					
					}		    
				}
			}
			
			
		}
		vectword.clear(); //处理完一句，清除该向量
		vectMid.clear();
		vectWorID.clear();
	}
	CountPara();
	CountParaII();
	
	vectWordList.clear();
}

// Splits one corpus line into tokens and appends them to vectword.
//
// Each '/' found (searching from one past the previous '/') yields one token:
// the word name is the text from the current token start up to the '/', and
// the sense ID is the text between the '/' and the next space. The next token
// starts right after that space.
//
// Positions are deliberately kept in `int`: a failed find() becomes -1, which
// ends the loop for '/', and for the space makes the substr() length wrap to a
// huge value so that the sense ID runs to the end of the line.
void CMethod::SplitLine(string sentence)
{
	Word   token;
	string senseText;
	int    tokenStart = 0;
	int    slashAt    = 0;
	int    spaceAt;

	for(;;)
	{
		slashAt = sentence.find("/", slashAt + 1);
		if(slashAt == -1)
		{
			break;                      // no further "word/sense" pair on this line
		}

		spaceAt   = sentence.find(" ", slashAt);
		senseText = sentence.substr(slashAt + 1, spaceAt - slashAt - 1);

		token.wordName = sentence.substr(tokenStart, slashAt - tokenStart);   // the word
		token.senseID  = senseText;                                           // its sense tag
		vectword.push_back(token);                                            // collect all words of the sentence

		tokenStart = spaceAt + 1;
	}
}


// Records one occurrence of the context word `ID` in the binary search tree
// rooted at T (ordered by ContextWordID, smaller IDs to the left).
// If a node with that ID exists its ContextWordCount is incremented;
// otherwise a new leaf with count 1 is allocated and linked in. T itself is
// updated when the tree was empty.
void CMethod::CreateTree(BTREE &T, int ID)
{
	BTREE *link = &T;   // the link (root or child pointer) currently being examined

	while(*link != NULL)
	{
		if((*link)->ContextWordID == ID)
		{
			(*link)->ContextWordCount ++;
			return;
		}
		link = ((*link)->ContextWordID > ID) ? &(*link)->ChildL
		                                     : &(*link)->ChildR;
	}

	// Reached an empty link: hang a fresh node there.
	*link = new CContext;
	(*link)->ContextWordID    = ID;
	(*link)->ContextWordCount = 1;
	(*link)->ChildL = (*link)->ChildR = NULL;
}

// Computes the smoothed probabilities stored in every vectWordList entry.
//
// For every entry:
//     P_LAM = LAMDA / (wordCount + NN * LAMDA)
//
// Entries are then processed in runs of consecutive equal wordID (the senses
// of one word in the sorted list). With Sum = total wordCount of the run and
// n = number of entries in the run:
//   * n > 1 (ambiguous word):
//         Psk = (wordCount + LAM) / (Sum + n * LAM)   when Sum != 0
//         Psk = LAM                                   when Sum == 0
//   * n == 1 (unambiguous word):
//         Psk = 1 when Sum != 0, otherwise LAM; Context_T is reset to 0
//         because the context of an unambiguous word is not needed.
//
// Changes over the previous version, which walked the list with special
// cases for the tail:
//   * an empty vectWordList is handled (it used to index element 0 blindly);
//   * a list holding a single entry now gets its Psk (the loop started at 1
//     and never assigned it);
//   * the last entry, when unambiguous, now also has Context_T reset like
//     every other unambiguous entry.
// NOTE(review): resetting Context_T drops the pointer without deleting any
// nodes it may reference, exactly as before.
void CMethod::CountPara()
{
	const int total    = vectWordList.size();
	int       runBegin = 0;

	while(runBegin < total)
	{
		// Collect the run of entries sharing this wordID, filling P_LAM on the way.
		const int runID  = vectWordList[runBegin].wordID;
		int       Sum    = 0;
		int       runEnd = runBegin;
		for(; runEnd < total && vectWordList[runEnd].wordID == runID; runEnd++)
		{
			vectWordList[runEnd].P_LAM = (float)LAMDA / (float)(vectWordList[runEnd].wordCount + NN * LAMDA);
			Sum += vectWordList[runEnd].wordCount;
		}
		const int n_count = runEnd - runBegin;

		if(n_count > 1)
		{
			// Several senses: distribute the probability mass with LAM smoothing.
			for(int k = runBegin; k < runEnd; k++)
			{
				if(Sum != 0)
				{
					vectWordList[k].Psk = (float)(vectWordList[k].wordCount + LAM)/(float)(Sum + n_count * LAM);
				}
				else
				{
					vectWordList[k].Psk = (float)LAM;   // word never seen in the corpus
				}
			}
		}
		else
		{
			// Single sense: certain if it was seen at all.
			if(Sum != 0)
			{
				vectWordList[runBegin].Psk = 1;
			}
			else
			{
				vectWordList[runBegin].Psk = (float)LAM;
			}
			vectWordList[runBegin].Context_T = 0;   // context of an unambiguous word is ignored
		}

		runBegin = runEnd;
	}
}

// Writes the trained model to disk.
//
// "Pvs-hebing" (binary) receives, for every entry that has a context tree,
// the records produced by IntravTree(). "Psk-hebing.txt" receives one text
// line per entry:
//     wordID senseID Psk P_LAM begin end
// where begin/end are the stream positions in "Pvs-hebing" before and after
// that entry's records were written, or "0 0" when the entry has no tree.
void CMethod::CountParaII()
{
	const int entryCount = vectWordList.size();
	fstream foutPsk("Psk-hebing.txt", ios::out);
	fstream foutPvs("Pvs-hebing", ios::out|ios::binary);

	for(int idx = 0; idx < entryCount; idx++)
	{
		// Offsets stay 0/0 for entries without a context tree.
		long blockStart = 0;
		long blockStop  = 0;

		if(vectWordList[idx].Context_T != NULL)
		{
			blockStart = foutPvs.tellp();                               // position before this entry's records
			IntravTree(foutPvs, vectWordList[idx].Context_T, idx);      // dump the Pvs values held in the tree
			blockStop  = foutPvs.tellp();                               // position after this entry's records
		}

		foutPsk<<vectWordList[idx].wordID<<" "<<vectWordList[idx].word.senseID
		       <<" "<<vectWordList[idx].Psk<<" "<<vectWordList[idx].P_LAM
		       <<" "<<blockStart<<" "<<blockStop<<endl;
	}
}

// In-order traversal of the context tree T that belongs to vectWordList[k].
// For every node a wordPVS record {IDw = ContextWordID, Pvs} is written to
// fout as raw bytes, where
//     Pvs = (ContextWordCount + LAMDA) / (vectWordList[k].wordCount + LAMDA * NN)
// Because of the in-order walk the records appear in ascending ContextWordID.
void CMethod::IntravTree(fstream & fout, BTREE & T, int k)
{
	if(T == NULL)
	{
		return;
	}

	IntravTree(fout, T->ChildL, k);

	const float smoothedCount = (float)(T->ContextWordCount + LAMDA);
	const float smoothedTotal = (float)(vectWordList[k].wordCount + LAMDA * NN);
	float Pvs = smoothedCount / smoothedTotal;
	if(Pvs == 0)                                  // Pvs == 0 is practically impossible
	{
		Pvs = vectWordList[k].P_LAM;
	}

	wordPVS record;
	record.IDw = T->ContextWordID;
	record.Pvs = Pvs;
	fout.write((char *) & record, sizeof(wordPVS));    // binary output

	IntravTree(fout, T->ChildR, k);
}
method.cpp - 源码说明

本页面展示了「利用贝叶斯分类原理实现多义词的消歧。首先利用训练语料进行训练」中的 method.cpp 源码文件，采用 C++ 编程语言编写，共 313 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与贝叶斯相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?