⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 xiaoqichuli.cpp

📁 利用贝叶斯分类原理实现多义词的消歧。首先利用训练语料进行训练
💻 CPP
字号:
// xiaoqichuli.cpp: implementation of the xiaoqichuli class.
//
//////////////////////////////////////////////////////////////////////

#include "stdafx.h"
//#include "DMBGT.h"
#include "xiaoqichuli.h"
#include <algorithm>
#include "math.h"
//#define LAM 0.00000001

#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__;
#define new DEBUG_NEW
#endif

//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////

// Constructor: fills both in-memory tables before the object is used.
// The word/word-ID list is loaded first, then the per-sense Psk list.
xiaoqichuli::xiaoqichuli()
{
	this->CreateWordIDList();
	this->CreatePskList();
}

// Destructor: the body performs no work; it only contains a disabled
// debug message box.
xiaoqichuli::~xiaoqichuli()
{
//	AfxMessageBox(" testing is over!");
}
////////////////////////////////////////////////////
void xiaoqichuli::CreateWordIDList()
{
	ifstream finID("Cup_词ID词义ID表.txt");
	wordID wordid;
	string wordTmp;
	string strCompare = "";
	int ID;
	//float P;
	string P;
	while(finID>>wordTmp>>ID>>P)
	{
		if(wordTmp != strCompare)
		{
			strCompare  = wordTmp;
			wordid.word = wordTmp;
			wordid.IDw  = ID;
			vectWordID.push_back(wordid);
		}		
	}
}
////////////////////////////////////////////////////
void xiaoqichuli::CreatePskList()
{
	fstream finPsk("Psk-hebing.txt",ios::in);
	wordPsk wordpsk;
	int   idw;
//	int   ids;
	string ids;
	float psk;
	float P_LAM;
	long  B;
	long  E;
	while(finPsk>>idw>>ids>>psk>>P_LAM>>B>>E)
	{
		wordpsk.IDw   = idw;
		wordpsk.IDs   = ids;
		wordpsk.Psk   = psk;
		wordpsk.P_LAM = P_LAM;
		wordpsk.begin = B;
		wordpsk.end   = E;
		vectWordPsk.push_back(wordpsk);
	}	
	sort(vectWordPsk.begin(), vectWordPsk.end());
}

////////////////////////////////////////////////////
// Splits one tagged sentence into words, resolves a sense ID for every word
// and writes the re-tagged sentence into chSentence.
//
// Sentence   : text made of "word/tag" tokens separated by a space. The word
//              is everything from the start of a token up to its '/'; the
//              tag (everything up to the next space) is skipped.
// chSentence : destination buffer for the result built by CountSore().
//              NOTE(review): the copy is an unbounded strcpy, so the caller
//              must supply a buffer large enough for the whole result; the
//              signature carries no size to check against.
//
// The words are appended to vectWordTmp, then SearchIDw() and CountSore()
// are run on them.
void xiaoqichuli::ReadCorpus(string Sentence, char * chSentence)
{
	string   m_sentence;
	word_Tmp wordTmp;

	string::size_type wordBegin  = 0;   // first character of the current token
	string::size_type searchFrom = 1;   // a word has at least one character

	for(;;)
	{
		const string::size_type slashPos = Sentence.find('/', searchFrom);
		if(slashPos == string::npos)
		{
			break;
		}

		wordTmp.word = Sentence.substr(wordBegin, slashPos - wordBegin);
		vectWordTmp.push_back(wordTmp);

		// Skip the tag. Without a following space this was the last token;
		// stopping here avoids restarting the next word at offset 0.
		const string::size_type tagEnd = Sentence.find(' ', slashPos);
		if(tagEnd == string::npos)
		{
			break;
		}

		// Resume the '/' search inside the NEXT token, one past its first
		// character. Searching from just after the previous slash would
		// also match a '/' inside the tag (or a token whose word is "/"),
		// producing a negative length and a word that swallows the rest of
		// the sentence.
		wordBegin  = tagEnd + 1;
		searchFrom = wordBegin + 1;
	}

	SearchIDw();
	CountSore(m_sentence);

	if(chSentence != 0)                 // tolerate a missing output buffer
	{
		strcpy(chSentence, m_sentence.c_str());
	}
}
/////////////////////////////////////////////////////

void xiaoqichuli::SearchIDw()
{
	unsigned i = 0;
	vector<wordID>::iterator pIt;
	wordID ToSearch;
	int IDTmp;                                    //临时IDw
	for(i; i < vectWordTmp.size(); i++)
	{
		ToSearch.word = vectWordTmp[i].word;
		pIt = lower_bound(vectWordID.begin(),vectWordID.end(),ToSearch);
		if(pIt != vectWordID.end() && pIt->word == vectWordTmp[i].word)
		{
			IDTmp     = pIt->IDw;
		}
		else 
		{
			IDTmp     = -1;                     //找不到的令IDw为-1
		}
		vectIDTmp.push_back(IDTmp);         //每句的词汇IDw压入临时表
	}
}

////////////////////////////////////////////////////////////////////////////////////
// Reads the wordPvs records stored in the binary file "Pvs-hebing" between
// byte positions pIt->begin and pIt->end and appends them to vectPvs.
//
// NOTE(review): vectPvs is received BY VALUE, so the records appended here
// live only in this function's copy and are not visible to the caller. The
// signature is fixed by the class declaration; _ReadPvs(), which returns the
// vector, is the variant CountSore() actually calls.
//
// vectPvs : local copy of the caller's vector (see note above).
// pIt     : entry of vectWordPsk whose begin/end fields delimit the records.
void xiaoqichuli::ReadPvs(vector<wordPvs> vectPvs, vector<wordPsk> ::iterator pIt)
{
	// An empty or inverted range holds no records. Without this guard the
	// unsigned division below would turn a negative span into a huge count.
	if(pIt->end <= pIt->begin)
	{
		return;
	}
	const int recordCount = (pIt->end - pIt->begin)/sizeof(wordPvs);

	fstream finPvs("Pvs-hebing", ios::in|ios::binary);
	if(!finPvs)
	{
		return;                         // file missing or unreadable
	}
	finPvs.seekg(pIt->begin);

	wordPvs raw;
	wordPvs PvsTmp;
	for(int k = 0; k < recordCount; k++)
	{
		finPvs.read(reinterpret_cast<char *>(&raw), sizeof(wordPvs));
		if(!finPvs)
		{
			break;                      // short read: 'raw' was not filled in
		}
		PvsTmp.IDw = raw.IDw;
		PvsTmp.Pvs = raw.Pvs;
		vectPvs.push_back(PvsTmp);
	}
}

// Reads the wordPvs records stored in the binary file "Pvs-hebing" between
// byte positions pIt->begin and pIt->end and returns them.
//
// pIt     : entry of vectWordPsk whose begin/end fields delimit the records.
// returns : the records read, in file order. The result is empty when the
//           range is empty or inverted or the file cannot be opened, and
//           shorter than requested when the file ends early.
vector<wordPvs> xiaoqichuli::_ReadPvs(vector<wordPsk> ::iterator pIt)
{
	vector<wordPvs> vectPvs;

	// An empty or inverted range holds no records. Without this guard the
	// unsigned division below would turn a negative span into a huge count.
	if(pIt->end <= pIt->begin)
	{
		return vectPvs;
	}
	const int recordCount = (pIt->end - pIt->begin)/sizeof(wordPvs);

	fstream finPvs("Pvs-hebing", ios::in|ios::binary);
	if(!finPvs)
	{
		return vectPvs;                 // file missing or unreadable
	}
	finPvs.seekg(pIt->begin);

	vectPvs.reserve(recordCount);

	wordPvs raw;
	wordPvs PvsTmp;
	for(int k = 0; k < recordCount; k++)
	{
		finPvs.read(reinterpret_cast<char *>(&raw), sizeof(wordPvs));
		if(!finPvs)
		{
			break;                      // short read: 'raw' was not filled in
		}
		PvsTmp.IDw = raw.IDw;
		PvsTmp.Pvs = raw.Pvs;
		vectPvs.push_back(PvsTmp);
	}
	return vectPvs;
}


//////////////////////////////////////////////////////////////////////////////////////

void xiaoqichuli::CountPvs(int i, vector<wordPvs> vectPvsTmp, vector<wordPsk> ::iterator pIt)
{
	vector<int>                vectConID;
	float                      Pvs;
	vector<wordPvs> ::iterator pIt1;
	pair<double,string>           SoreTmp;
	SoreTmp.first = 0;
	SoreTmp.second = pIt->IDs;

	wordPvs Tosearch;
	vectConID = vectIDTmp;                     
	vectConID.erase(vectConID.begin() + i);       //确定上下文的IDw并放入向量中
	int n = 0, m = vectConID.size();
	int count = 0;

	if(count ==0)
	{
		SoreTmp.first = log(pIt->Psk);
		vectSore.push_back(SoreTmp);                         //将Sore值计入表中
	}
	else
	{
		SoreTmp.first += log(pIt->Psk);                            //乘上Psk并取对数
		vectSore_T.push_back(SoreTmp);                             //将Score值计入表中
		SoreTmp.first = log(pIt->Psk);          
		vectSore.push_back(SoreTmp);                               //只考虑发射概率的Score              
		
	}
}
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Chooses a sense ID for every word in vectWordTmp and rebuilds the sentence.
//
// For each word i (its word ID is vectIDTmp[i]):
//   * word ID -1 (unknown word)            -> sense ID "-1";
//   * word ID absent from vectWordPsk      -> message box, sense ID untouched;
//   * exactly one entry with that word ID  -> that entry's sense ID;
//   * several entries                      -> every entry is scored through
//     _ReadPvs()/CountPvs() and the highest score wins (first one on ties).
//
// m_sentence : output; receives "word/senseID " for every word, in order.
//
// vectWordTmp and vectIDTmp are consumed: both are cleared before returning.
void xiaoqichuli::CountSore(string & m_sentence)
{
	vector<wordPsk> ::iterator pIt;
	vector<wordPvs> vectWordPvsTmp;
	wordPsk ToSearch;

	int i = 0;
	const int wordTotal = vectWordTmp.size();
	for(i = 0; i < wordTotal; i++)
	{
		if(vectIDTmp[i] == -1)          // word was not found in the ID table
		{
			vectWordTmp[i].IDs = "-1";
			continue;
		}

		ToSearch.IDw = vectIDTmp[i];
		pIt = lower_bound(vectWordPsk.begin(), vectWordPsk.end(), ToSearch);
		if(pIt == vectWordPsk.end() || pIt->IDw != vectIDTmp[i])
		{
			AfxMessageBox("Can't find");
			continue;
		}

		// Look at the neighbours only when they exist: pIt may be the first
		// or the last element, where pIt-1 / pIt+1 must not be dereferenced.
		const bool prevSame = (pIt != vectWordPsk.begin())
		                   && ((pIt - 1)->IDw == pIt->IDw);
		const bool nextSame = ((pIt + 1) != vectWordPsk.end())
		                   && ((pIt + 1)->IDw == pIt->IDw);

		if(!prevSame && !nextSame)
		{
			vectWordTmp[i].IDs = pIt->IDs;      // single sense: no ambiguity
			continue;
		}

		// Ambiguous word: score pIt and every earlier entry with the same
		// word ID. The loop tests the element before stepping back, so the
		// entry at begin() is scored too.
		vector<wordPsk> ::iterator back = pIt;
		for(;;)
		{
			if(back->IDw != vectIDTmp[i])
			{
				break;
			}
			vectWordPvsTmp = _ReadPvs(back);
			CountPvs(i, vectWordPvsTmp, back);
			vectWordPvsTmp.clear();

			if(back == vectWordPsk.begin())
			{
				break;
			}
			--back;
		}

		// ...then every later entry with the same word ID.
		vector<wordPsk> ::iterator fwd = pIt + 1;
		while(fwd != vectWordPsk.end() && fwd->IDw == vectIDTmp[i])
		{
			vectWordPvsTmp = _ReadPvs(fwd);
			CountPvs(i, vectWordPvsTmp, fwd);
			vectWordPvsTmp.clear();
			++fwd;
		}

		// Pick the best-scoring sense. vectSore_T takes precedence when it
		// holds anything; otherwise fall back to vectSore.
		if(vectSore_T.size() > 0)
		{
			double bestScore = vectSore_T[0].first;
			vectWordTmp[i].IDs = vectSore_T[0].second;
			for(unsigned int t = 1; t < vectSore_T.size(); t++)
			{
				if(vectSore_T[t].first > bestScore)
				{
					bestScore = vectSore_T[t].first;
					vectWordTmp[i].IDs = vectSore_T[t].second;
				}
			}
		}
		else if(vectSore.size() > 0)    // guard: never index an empty list
		{
			double bestScore = vectSore[0].first;
			vectWordTmp[i].IDs = vectSore[0].second;
			for(unsigned int s = 1; s < vectSore.size(); s++)
			{
				if(vectSore[s].first > bestScore)
				{
					bestScore = vectSore[s].first;
					vectWordTmp[i].IDs = vectSore[s].second;
				}
			}
		}
		vectSore.clear();
		vectSore_T.clear();
	}

	// Re-assemble the sentence as "word/senseID " tokens.
	m_sentence = "";
	for(unsigned int k = 0; k < vectWordTmp.size(); k++)
	{
		m_sentence += vectWordTmp[k].word;
		m_sentence += "/";
		m_sentence += vectWordTmp[k].IDs;
		m_sentence += " ";
	}

	// Reset the per-sentence state for the next ReadCorpus() call.
	vectIDTmp.clear();
	vectWordTmp.clear();
}


⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -