hypothesis.cpp

来自「解码器是基于短语的统计机器翻译系统的核心模块」· C++ 代码 · 共 712 行 · 第 1/2 页
CPP
712 行
#include "Hypothesis.h"
#include <cctype>
#include <cmath>
#include <iterator>
#include <sstream>

using namespace std;


// Verbose-trace switch defined in another translation unit; when true, the
// functions in this file print per-hypothesis score details to stdout.
extern bool printmore;

// Global pool of hypothesis elements; the hypothesis stacks store integer
// indices into this pool rather than the elements themselves.
extern VECPOOL vecHypo;
// Indices of pool slots that are currently free. Hypothesis::clear() refills
// it with every index of vecHypo; findProper() draws slots from it.
extern VECUNSED vecNotUsed;

// Stores the decoder's tuning parameters.
//   thresholdI     - beam threshold as a plain ratio; kept internally as its natural log
//   stackThreshold - size limit applied to each hypothesis stack
//   nBest          - 1 selects single-best output, any other value the n-best path
//   dislimit       - distortion score assigned when a jump exceeds `len`
//   lmlimit        - limit value handed to the LanguageModel constructor in load()
//   len            - maximum jump length scored linearly (0 disables the cap)
Hypothesis::Hypothesis(double thresholdI, int stackThreshold, int nBest, double dislimit, double lmlimit, int len)
{
	// Search / pruning settings.
	NBEST = nBest;
	eachStackSize = stackThreshold;
	threshold = log(thresholdI); // beam comparisons are done on log-scale scores

	// Distortion and language-model settings.
	DISLENGHT = len;
	DISTORTIONLIMIT = dislimit;
	LMLimit = lmlimit;
}

// Reads the configuration file, then loads the language model and the phrase
// table, reporting the wall-clock time each one takes on stdout.
//   inifileName - path of the configuration file passed to config.load()
// Returns true when both models loaded, false as soon as any step fails.
bool Hypothesis::load(string inifileName)
{
	config.load(inifileName, para);
	cout << "set the parameters ... " << endl;

	enVcb = new Vocab();
	cnVcb = new Vocab();
	if ((!enVcb) || (!cnVcb)) {
		// Only reachable with a non-throwing operator new. Bail out instead of
		// handing null vocabularies to the loaders below.
		cout << "new Vocab error!" << endl;
		return false;
	}

	lm = new LanguageModel(LMLimit);
	cout << "load language model from " << para.lmodel_file << endl;
	time_t lmold, lmnew;
	time(&lmold);
	if (!lm->load(para.lmodel_file, enVcb))
	{
		cout << "load language model " << para.lmodel_file << " error !" << endl;
		return false;
	}
	time(&lmnew);
	cout << "language model load finished, it takes " << difftime(lmnew, lmold) << " seconds!" << endl;

	cout << "load phrase table from " << para.ttable_file << endl;
	to = new TransOptions(para);
	time_t toold, tonew;
	time(&toold);
	if (!to->load(para.ttable_file, lm, enVcb, cnVcb)) {
		cout << "load phrase table " << para.ttable_file << " error !" << endl;
		return false;
	}
	time(&tonew);
	cout << "phrase table load finished, it takes "<< difftime(tonew, toold) << " seconds!" << endl;
	return true;
}

// Prepares the decoder for one input sentence: converts it to word IDs,
// collects the candidate phrases, sizes the per-stack containers and seeds
// stack 0 with the initial (empty) hypothesis.
//   sentence - the source sentence to translate
void Hypothesis::initialize(string sentence)
{
	cnVcb->senToIDs(sentence, sentenceIDS);
	enVcb->unkTMP = cnVcb->unkTMP;
	stackSize = to->getEPhrase(sentenceIDS, phraseSnippet, phraseSnippetPosition);
	hypothesisStack.resize(stackSize + 1);
	THRESHOLD.resize(stackSize + 1);

	int first = findProper(vecNotUsed);
	if (first == -1) {
		// No free slot (decoder() treats -1 the same way): grow the pool rather
		// than indexing vecHypo with -1. Slot `oldsize` is taken immediately,
		// so only the slots after it are put on the free list.
		const int oldsize = vecHypo.size();
		const int newsize = (oldsize > 0) ? 2 * oldsize : 1;
		vecHypo.resize(newsize);
		for (int n = oldsize + 1; n < newsize; n++)
		{
			vecNotUsed.push_back(n);
		}
		first = oldsize;
	}

	// Initial hypothesis: no previous hypothesis, LM history is just "<s>"
	// (lastEWI == -1 marks a one-word history, see lmCal()).
	// NOTE(review): score fields, lastPos and oldphrase are not reset here;
	// presumably findProper() hands back a clean element - confirm.
	vecHypo[first].lastEWI = -1;
	vecHypo[first].lastEWII = enVcb->getIndex("<s>");
	vecHypo[first].lmScore = 0.0;
	vecHypo[first].prev = -1;
	vecHypo[first].thisID = 0;
	vecHypo[first].lastID = 0;

	hypothesisStack[0].push_back(first);
}

void Hypothesis::clear()
{
	//release the resource
	phraseSnippet.clear();
	phraseSnippetPosition.clear();
	THRESHOLD.clear();
	HypothesisStack::iterator	pos;
	for(pos = hypothesisStack.begin(); pos != hypothesisStack.end(); ++pos)
	{
		pos->clear();
	}
	hypothesisStack.clear();
	
	vecNotUsed.clear();
	int poolsize = vecHypo.size();
	for (int n = 0; n < poolsize; n++)
	{
		vecNotUsed.push_back(n);
	}

	arc.clear();
	sentenceIDS.clear();
}

inline void Hypothesis::split(const string& line, vector<string>& strs)
{
	istrstream ist(line.c_str());
	string w;
	while(ist>>w) strs.push_back(w);
}

double Hypothesis::lmCal(int last1, int last2, vector<int> newPhrase, int& newlast1, int& newlast2)
{
	double lmScore = 0.0;
	if (newPhrase.size() == 0) 
	{
		return 0;
	}
	else
	{	
		vector<int> vecPhrase;
		if (last1 == -1) 
		{
			vecPhrase.push_back(last2);
			int l = newPhrase.size();
			for(int i = 0; i < l; i++)
			{
				vecPhrase.push_back(newPhrase[i]);
			}
			int len = vecPhrase.size();
			newlast1 = vecPhrase[len - 2];
			newlast2 = vecPhrase[len - 1];
			int endS = enVcb->getIndex("</s>");
			if (newPhrase[0] != endS) {
				vector<int>::iterator pos = vecPhrase.end();
				for(int i = 0; i < len - 1; i++)
				{
					double lmTmp = lm->wordProb(vecPhrase) * para.weight_l;
					if(printmore) {
						cout << "\tlanguage model cost for '" << enVcb->getWord(vecPhrase[vecPhrase.size() - 1]) << "' " << lmTmp << endl; 			
					}
					lmScore += lmTmp;
					vecPhrase.erase(--pos);
				}			
			}
			else {
				lmScore = lm->wordProb(vecPhrase) * para.weight_l;
				if(printmore) {
					cout << "\tlanguage model cost for '" << enVcb->getWord(vecPhrase[vecPhrase.size() - 1]) << "' " << lmScore << endl; 			
				}
			}		
		}
		else
		{
			vecPhrase.push_back(last1);
			vecPhrase.push_back(last2);
			//			split(newPhrase, vecPhrase);
			int l = newPhrase.size();
			for(int i = 0; i < l; i++)
			{
				vecPhrase.push_back(newPhrase[i]);
			}
			int len = vecPhrase.size();
			newlast1 = vecPhrase[len - 2];
			newlast2 = vecPhrase[len - 1];
			
			int endS = enVcb->getIndex("</s>");
			if (newPhrase[0] != endS) {
				vector<int>::iterator pos = vecPhrase.end();
				for(int i = 0; i < len - 2; i++)
				{
					double lmTmp = lm->wordProb(vecPhrase) * para.weight_l;
					if(printmore) {
						cout << "\tlanguage model cost for '" << enVcb->getWord(vecPhrase[vecPhrase.size() - 1]) << "' " << lmTmp << endl; 			
					}				
					lmScore += lmTmp;
					vecPhrase.erase(--pos);
				}
				//		lmScore = lmScore + 1;
			}
			else {
				lmScore = lm->wordProb(vecPhrase) * para.weight_l;
				if (printmore){
					cout << "\tlanguage model cost for '" << enVcb->getWord(vecPhrase[vecPhrase.size() - 1]) << "' " << lmScore << endl; 			
				}
			}
		}
	}
	return lmScore;
}

// Future-cost estimate for a partial hypothesis: the sum of
// to->getFutureCost(first, last) over every run of positions in
// [0, stackSize - 1] that is not in `phraseID`.
//   phraseID  - covered source positions; the smallest element is skipped
//               (presumably the -1 sentinel mentioned in decoder() - confirm)
//   stackSize - number of source positions in the sentence
double Hypothesis::fcCal(set<int> phraseID, int stackSize)
{
	double fcScore = 0.0;

	// With fewer than two elements there is no covered position after the
	// skipped one; treat the whole sentence as uncovered instead of
	// dereferencing end().
	if (phraseID.size() < 2) {
		if (stackSize - 1 >= 0) {
			fcScore += to->getFutureCost(0, stackSize - 1);
		}
		return fcScore;
	}

	set<int>::const_iterator pos = phraseID.begin();
	++pos; // skip the first element

	// Gap before the first covered position.
	if (*pos != 0) {
		fcScore += to->getFutureCost(0, *pos - 1);
	}
	int prevCovered = *pos;

	// Gaps between consecutive covered positions.
	for (++pos; pos != phraseID.end(); ++pos)
	{
		if (*pos - 1 > prevCovered)
		{
			fcScore += to->getFutureCost(prevCovered + 1, *pos - 1);
		}
		prevCovered = *pos;
	}

	// Gap after the last covered position.
	if (stackSize - 1 >= prevCovered + 1) {
		fcScore += to->getFutureCost(prevCovered + 1, stackSize - 1);
	}
	return fcScore;
}

// Runs the stack search for the sentence set up by initialize().
// Each hypothesis in stack i is extended with every candidate phrase whose
// source span does not overlap the positions it already covers; each new
// hypothesis is scored and handed to recombineAndbeam().
//   fileName - base name for the n-best output (unused when NBEST == 1)
// Returns findBest() when NBEST == 1; otherwise calls findNBest() and returns
// the file name used: fileName + "." + stackSize zeros.
string Hypothesis::decoder(string fileName)
{
	for (int i = 0; i < stackSize; i++)
	{
		// NOTE(review): (i - 1 > 0) means stacks 0 and 1 are never cut - confirm intended.
		if ((i - 1 > 0) && (hypothesisStack[i].size() > eachStackSize)) {
			cutStack(hypothesisStack[i], eachStackSize, i);
		}
		multisetHYE::iterator posI = hypothesisStack[i].begin();
		for (; posI != hypothesisStack[i].end(); ++posI)
		{
			// Deliberate copy: vecHypo may be resized below, which would
			// invalidate a reference into it.
			HypothesisElement hyp = vecHypo[*posI];
			const set<int>& covered = hyp.oldphrase;

			const int len = phraseSnippet.size();
			for (int j = 0; j < len; j++)
			{
				fPosition posTmp = phraseSnippetPosition[j];

				// Skip the span if ANY position in [start, end] is already
				// covered. Checking only the two endpoints let a span be laid
				// over an already-translated word in its interior.
				set<int>::const_iterator firstCovered = covered.lower_bound(posTmp.start);
				if (firstCovered != covered.end() && *firstCovered <= posTmp.end)
				{
					continue;
				}

				candiPhrase::iterator pos;
				for (pos = phraseSnippet[j]->begin(); pos != phraseSnippet[j]->end(); ++pos)
				{
					const aboutEPhrase& onePhrase = **pos;

					int newhyp = findProper(vecNotUsed);
					if (newhyp == -1) {
						// Pool exhausted: double it (or create the first slot).
						// Slot `oldsize` is used right away, so it must NOT go
						// on the free list - otherwise it is handed out twice.
						const int oldsize = vecHypo.size();
						const int newsize = (oldsize > 0) ? 2 * oldsize : 1;
						vecHypo.resize(newsize);
						for (int n = oldsize + 1; n < newsize; n++)
						{
							vecNotUsed.push_back(n);
						}
						newhyp = oldsize;
					}

					// Valid until recombineAndbeam(): nothing in between resizes vecHypo.
					HypothesisElement& cand = vecHypo[newhyp];

					cand.thisID = ++HypothesisElement::baseID;
					cand.lastID = hyp.thisID;

					cand.baseScore = hyp.totalScore - hyp.futureScore;
					cand.transScore = onePhrase.pC;
					cand.newPhrase = onePhrase.ephrase;

					// Distance between the end of the previous span and the
					// start of this one.
					const int jump = abs(hyp.lastPos.end + 1 - posTmp.start);
					if ((DISLENGHT == 0) || (jump <= DISLENGHT)) {
						cand.distortionScore = jump * para.weight_d * (-1);
					}
					else
					{
						cand.distortionScore = DISTORTIONLIMIT; // log(0.1)
					}

					cand.wordsPenalty = onePhrase.ephrase.size() * para.word_penalty * (-1);
					cand.lastPos = posTmp;

					cand.oldphrase = hyp.oldphrase;
					if (printmore) {
						cout << "creating hypothesis " << cand.thisID << " from "<< cand.lastID << endl;
						cout << "\tbase score " << cand.baseScore << endl;
						cout << "\ttranslation cost " << cand.transScore << endl;
						cout << "\tdistortion cost " << cand.distortionScore << endl;
					}

					// Mark the positions of this span as covered.
					for (int t = posTmp.start; t <= posTmp.end; t++)
					{
						cand.oldphrase.insert(t);
					}

					cand.lmScore = lmCal(hyp.lastEWI, hyp.lastEWII, onePhrase.ephrase, cand.lastEWI, cand.lastEWII);

					// Once every foreign word is covered, append </s> to the
					// English sentence. The +1 is because a -1 was inserted
					// into the coverage set earlier.
					if (static_cast<int>(cand.oldphrase.size()) == stackSize + 1) {
						int str1, str2;
						int endofsent = enVcb->getIndex("</s>");
						vector<int> vecSentEnd;
						vecSentEnd.push_back(endofsent);
						double tail = lmCal(cand.lastEWI, cand.lastEWII, vecSentEnd, str1, str2);
						cand.lmScore += tail;
					}
					cand.futureScore = fcCal(cand.oldphrase, stackSize);

					cand.totalScore = cand.baseScore + cand.transScore + cand.distortionScore + cand.lmScore
						+ cand.wordsPenalty + cand.futureScore;
					if (printmore) {
						cout << "\tword penalty " << cand.wordsPenalty << endl;
						cout << "\tscore " << cand.totalScore - cand.futureScore << " + futureCost " << cand.futureScore << " = " << cand.totalScore << endl;
					}

					// Push the hypothesis onto its stack.
					cand.prev = *posI;
					recombineAndbeam(newhyp);
				}
			}
		}
	}
	if (printmore) {
		cout << "decode finished !" << endl;
	}
	if (NBEST == 1) {
		return findBest();
	}
	else {
		// n-best output goes to fileName + "." + one '0' per source position.
		string suffix = ".";
		int itmp = stackSize;
		while (itmp--) {
			suffix += "0";
		}
		suffix = fileName + suffix;
		findNBest(suffix);
		return suffix;
	}
}

void Hypothesis::recombineAndbeam(int newHyp)
{
	int len = vecHypo[newHyp].oldphrase.size() - 1;//which stack to input
	int stackIsize = hypothesisStack[len].size();
	if (stackIsize != 0) 
	{
		if (vecHypo[newHyp].totalScore - THRESHOLD[len] + threshold > avs) {
			THRESHOLD[len] = vecHypo[newHyp].totalScore + threshold;
			if(printmore) {
				cout << "new best estimate for this stack" << endl;
			}
		}
		if (vecHypo[newHyp].totalScore - THRESHOLD[len] > avs) 
		{
			//THRESHOLD = newHyp.totalScore + threshold;
			multisetHYE::iterator POS = hypothesisStack[len].begin();
hypothesis.cpp - 源码说明

本页面展示了「解码器是基于短语的统计机器翻译系统的核心模块」中的 hypothesis.cpp 源码文件，采用 C++ 编程语言编写，共 712 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与解码器相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?