⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 languagemodel.cpp

📁 解码器是基于短语的统计机器翻译系统的核心模块
💻 CPP
字号:
#include "LanguageModel.h"

using namespace std;

// Verbosity flag defined in another translation unit. In this file it is only
// read by LanguageModel::load(), which prints a completion message when it is true.
extern bool printmore;

// Constructs the model and records `prob` as UNKPROB.
// UNKPROB acts as a floor for the probability lookups in this file:
// uniProb/biProb/triProb return it whenever `score - UNKPROB < avs`.
// NOTE(review): presumably `prob` is a log-domain value, since the scores it
// is compared against are scaled log probabilities - confirm with the caller.
LanguageModel::LanguageModel(double prob)
{
	UNKPROB = prob;
}


// Loads the language model (SRILM output, ARPA standard format) from
// `fileName` into the uniGram / biGram / triGram tables.
//
// Every line that contains at least one tab is treated as an n-gram entry:
//     <prob> TAB <n-gram words> [TAB <back-off weight>]
// Lines without a tab (section headers, counts, blank lines) are skipped.
// The n-gram text is converted to word ids with vocab->getIndices(); its
// return value selects the table (1 -> uniGram, 2 -> biGram, 3 -> triGram).
// Entries for which it returns anything else are ignored.
//
// Parameters:
//   fileName - path of the model file to read.
//   vocab    - vocabulary used to map words to ids; also stored in `vcb`.
// Returns:
//   true on success; false if `vocab` is null or the file cannot be opened
//   (a message is written to stdout in both failure cases).
bool LanguageModel::load(string fileName, Vocab *vocab)
{
	vcb = vocab;
	if (!vcb) {
		cout << "english vocab is NULL!" << endl;
		return false;
	}
	input.open(fileName.c_str(), std::ios::in);
	if (!input)
	{
		cout << "Open Language Model" << fileName << "Error" << endl;
		return false;
	}

	string line;
	while (getline(input, line))
	{
		// Keep positions in size_type so the comparison with npos is exact.
		const string::size_type firstTab = line.find('\t');
		if (firstTab == string::npos) {
			continue;	// not an n-gram entry
		}
		const string::size_type lastTab = line.rfind('\t');

		// With a single tab, rfind() returns the same position as find(), not
		// npos. Such a line has no back-off column, and its n-gram text must
		// not be parsed as a back-off weight.
		const bool hasBackoff = (lastTab != firstTab);

		const string prob(line, 0, firstTab);
		string gram;
		ProbAndBO entry;
		entry.prob = atof(prob.c_str());
		if (hasBackoff) {
			const string backoffProb(line, lastTab + 1);
			gram.assign(line, firstTab + 1, lastTab - firstTab - 1);
			entry.backoffWeight = atof(backoffProb.c_str());
		}
		else {
			gram.assign(line, firstTab + 1, string::npos);
			entry.backoffWeight = 0;
		}

		vector<int> gramIDs;
		const int order = vcb->getIndices(gram, gramIDs);
		if (order == 1) {
			uniGram.insert(make_pair(gramIDs, entry));
		}
		else if (order == 2) {
			biGram.insert(make_pair(gramIDs, entry));
		}
		else if (order == 3) {
			triGram.insert(make_pair(gramIDs, entry));
		}
	}

	// Reset the EOF/fail state left by the final getline() before closing.
	input.clear();
	input.close();
	if (printmore) {
		cout << "Loading language model finished!" << endl;
	}

	return true;
}

double LanguageModel::uniProb(int word)
{
	vector<int> vecTmp;
	vecTmp.push_back(word);
	mapNgram::iterator pos = uniGram.find(vecTmp);
	double p = INFINITE;
	if (pos != uniGram.end()) {
		p = (pos->second).prob * LN;
	}
	if (p - UNKPROB < avs) {
			return UNKPROB;
	}
	return p;

}

double LanguageModel::uniBO(int word)
{
	vector<int> vecTmp;
	vecTmp.push_back(word);
	mapNgram::iterator pos = uniGram.find(vecTmp);
	if (pos != uniGram.end()) {
		return (pos->second).backoffWeight * LN;
	}
	else
	{
		return UNKBO;
	}	
}
//p(wd2|wd1)= if(bigram exists) p_2(wd1,wd2)
//            else              bo_wt_1(wd1)*p_1(wd2)
double LanguageModel::biProb(int word1, int word2)
{
	vector<int> vecTmp;
	vecTmp.push_back(word1);
	vecTmp.push_back(word2);
	mapNgram::iterator pos = biGram.find(vecTmp);
	double p = INFINITE;
	if (pos != biGram.end()) {
		p = (pos->second).prob * LN;	
	}
	else {
		p = uniBO(word1) + uniProb(word2);
	}
	if (p - UNKPROB < avs) {
			return UNKPROB;
	}
	return p;
}

double LanguageModel::biBO(int word1, int word2)
{
	vector<int> vecTmp, vecTmp1, vecTmp2;
	vecTmp.push_back(word1);
	vecTmp.push_back(word2);
	mapNgram::iterator pos = biGram.find(vecTmp);
	if (pos != biGram.end()) 
	{
		return (pos->second).backoffWeight * LN;
	}
}

//p(wd3|wd1,wd2)= if(trigram exists)           p_3(wd1,wd2,wd3)
//                else if(bigram w1,w2 exists) bo_wt_2(w1,w2)*p(wd3|wd2)
//                else                         p(wd3|w2)

double LanguageModel::triProb(int word1, int word2, int word3)
{
	vector<int> vecTmp, vecTmp12;
	vecTmp.push_back(word1);
	vecTmp.push_back(word2);
	vecTmp.push_back(word3);
	vecTmp12.push_back(word1);
	vecTmp12.push_back(word2);

	mapNgram::iterator pos = triGram.find(vecTmp);
	double p = INFINITE;
	if (pos != triGram.end()) {
		p = (pos->second).prob * LN;
		
	}
	else if (biGram.find(vecTmp12) != biGram.end()) 
	{
		p = (biBO(word1, word2) + biProb(word2, word3)) ;
	}
	else
	{
		p = biProb(word2, word3);
	}
	if (p - UNKPROB < avs) {
		return UNKPROB;
	}
	return p;

}

// Scores the last word of `Ephrase` given at most two preceding words:
//   1 word   -> uniProb(w0)
//   2 words  -> biProb(w0, w1)
//   3+ words -> triProb over the final three words
// An empty phrase returns 0.0 (nothing to score). Previously an empty vector
// reached the trigram branch and indexed Ephrase[len - 3] with len == 0,
// which is out of bounds.
// NOTE(review): 0.0 assumes callers accumulate these scores additively -
// confirm against the decoder.
double LanguageModel::wordProb(vector<int> Ephrase)
{
	const vector<int>::size_type len = Ephrase.size();
	switch (len) {
	case 0:
		return 0.0;
	case 1:
		return uniProb(Ephrase[0]);
	case 2:
		return biProb(Ephrase[0], Ephrase[1]);
	default:
		// len >= 3: only the last three words matter for a trigram model.
		return triProb(Ephrase[len - 3], Ephrase[len - 2], Ephrase[len - 1]);
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -