transoptions.cpp

来自「解码器是基于短语的统计机器翻译系统的核心模块」· C++ 代码 · 共 347 行
CPP
347 行
#include "TransOptions.h"

#include <algorithm>
#include <cmath>
#include <iostream>
#include <sstream>
#include <strstream>

// Brings every std name into scope; the code below uses string, vector,
// cout, etc. without the std:: prefix.
using namespace std;
// Declared here only (no definition in this file). When true, load() and
// getEPhrase() print extra progress/diagnostic output to cout.
extern bool printmore;

/**
 * Copies the decoder settings out of `para` into this object's members.
 *
 * Nothing else happens here; the phrase table is read later by load().
 *
 * @param para  Parameter bundle supplying the weights and limits below.
 */
TransOptions::TransOptions(Para para)
{
	// Table limits.
	TTABLELIMIT = para.ttable_limit;   // candidates kept per source phrase in load()
	PHRASELIMIT = para.phrase_limit;   // phrase length limit

	// Weights applied to the logs of the five phrase-table scores in load().
	WPROBEF  = para.weight_ef;
	WLEXEF   = para.weight_lexef;
	WPROBFE  = para.weight_fe;
	WLEXFE   = para.weight_lexfe;
	WPENALTY = para.penalty;

	// Weights used by wordsProb(): language-model scale and per-word penalty.
	WLM     = para.weight_l;
	WEIGHTW = para.word_penalty;
}

/**
 * Reads a phrase table file and fills f2eVocab with scored translation
 * candidates.
 *
 * Expected line layout (one space on each side of every "|||"):
 *     source phrase ||| target phrase ||| s0 s1 s2 s3 [s4]
 * Lines that lack the delimiters, start with the delimiter, or carry fewer
 * than four scores are skipped.
 *
 * For every accepted line the translation-model cost is the weighted sum of
 * the logs of the five scores, and the total cost adds wordsProb() of the
 * target phrase. Once a source phrase already holds TTABLELIMIT candidates,
 * a new candidate only replaces the current minimum (per the LESS
 * comparator) when its total cost exceeds that minimum's by more than avs;
 * otherwise it is discarded. The first candidate of a source phrase is
 * always stored.
 *
 * @param fileName  Path of the phrase table file.
 * @param lm        Language model, stored in LM; must not be NULL.
 * @param evocab    Vocabulary used for target phrases, stored in ENVOCAB; must not be NULL.
 * @param cvocab    Vocabulary used for source phrases, stored in CNVOCAB; must not be NULL.
 * @return false when a pointer argument is NULL or the file cannot be
 *         opened (a message is printed to cout); true otherwise.
 */
bool TransOptions::load(string fileName, LanguageModel *lm, Vocab *evocab, Vocab *cvocab)
{
	ENVOCAB = evocab;
	if (!ENVOCAB) {
		cout << "english vocab is NULL!" << endl;
		return false;
	}

	CNVOCAB = cvocab;
	if (!CNVOCAB) {
		cout << "foreign vocab is NULL!" << endl;
		return false;
	}

	LM = lm;
	input.open(fileName.c_str(), std::ios::in);
	if (!input)
	{
		cout << "Open Phrase Table " << fileName << " Error!" << endl;
		return false;
	}
	if (!LM)
	{
		cout << "LM is NULL!" << endl;
		// Do not leave the member stream open on the failure path.
		input.close();
		return false;
	}

	const string DELIM("|||");
	// Stand-in for a missing fifth score column (value kept from the original).
	// NOTE(review): log(atof("0")) is -inf, so a four-column table yields a
	// -inf translation cost (NaN when WPENALTY is 0) -- confirm whether a
	// neutral value such as "1" was intended.
	const string ZERO("0");

	string strTmp;
	while (getline(input, strTmp))
	{
		// Search for the whole delimiter, not for a single '|' character.
		const string::size_type firstSymbol = strTmp.find(DELIM);
		if (firstSymbol == string::npos) {
			continue;
		}
		const string::size_type lastSymbol = strTmp.rfind(DELIM);
		// Need "x " before the first delimiter and at least " " + " " between
		// the two delimiters, otherwise the substring lengths below go negative.
		if (firstSymbol < 1 || lastSymbol < firstSymbol + 5) {
			continue;
		}

		// Score columns follow the last delimiter; split() ignores the
		// leading blank.
		vector<string> vecTmp;
		string probs(strTmp, lastSymbol + 3);
		split(probs, vecTmp);
		if (vecTmp.size() < 4) {
			continue;	// malformed line: not enough scores to index below
		}
		if (vecTmp.size() == 4) {
			vecTmp.push_back(ZERO);
		}

		// Strip the single blank that separates each phrase from "|||".
		string fPhrase(strTmp, 0, firstSymbol - 1);
		string ePhrase(strTmp, firstSymbol + 4, lastSymbol - firstSymbol - 5);
		vector<int> fPhraseVec;
		vector<int> ePhraseVec;
		CNVOCAB->getIndices(fPhrase, fPhraseVec);
		ENVOCAB->getIndices(ePhrase, ePhraseVec);

		double TMcost = WPROBEF * log(atof(vecTmp[0].c_str())) + WLEXEF * log(atof(vecTmp[1].c_str())) +
			WPROBFE * log(atof(vecTmp[2].c_str())) + WLEXFE * log(atof(vecTmp[3].c_str())) +
			WPENALTY * log(atof(vecTmp[4].c_str()));
		double LMcost = wordsProb(ePhraseVec);

		aboutEPhrase *ePhraseTmp = new aboutEPhrase();
		ePhraseTmp->c = TMcost + LMcost;
		ePhraseTmp->ephrase = ePhraseVec;
		ePhraseTmp->pC = TMcost;

		phraseTable::iterator posPT = f2eVocab.find(fPhraseVec);
		if (posPT == f2eVocab.end())
		{
			// First candidate seen for this source phrase.
			candiPhrase eAboutTmp;
			eAboutTmp.push_back(ePhraseTmp);
			f2eVocab.insert(make_pair(fPhraseVec, eAboutTmp));
			continue;
		}

		candiPhrase &candidates = posPT->second;
		int len = candidates.size();
		if (len < TTABLELIMIT) {
			candidates.push_back(ePhraseTmp);
			continue;
		}

		// List is full: the newcomer may only displace the current minimum.
		candiPhrase::iterator worst = min_element(candidates.begin(), candidates.end(), LESS());
		if (worst != candidates.end() && (ePhraseTmp->c - (*worst)->c > avs)) {
			delete *worst;
			*worst = ePhraseTmp;
		}
		else {
			// Rejected candidates are not stored anywhere, so free them here.
			delete ePhraseTmp;
		}
	}
	input.clear();
	input.close();
	if (printmore) {
		cout << "Loading phrase table finished!!!" << endl;
	}
	return true;
}

inline void TransOptions::split(const string& line, vector<string>& strs)
{
	istrstream ist(line.c_str());
	string w;
	while(ist>>w) strs.push_back(w);
}

/**
 * Sorts `forErase` with the GREATER comparator and keeps only its first
 * `limit` entries.
 *
 * The list is always sorted, even when nothing is removed. Entries that are
 * dropped are only removed from the container; they are not deleted here.
 *
 * @param forErase  Candidate list to sort and truncate in place.
 * @param limit     Maximum number of entries to keep. A negative value is
 *                  treated as 0; a value >= the current size removes nothing.
 */
void TransOptions::eraseSet(candiPhrase& forErase, int limit)
{
	stable_sort(forErase.begin(), forErase.end(), GREATER());

	// begin() + limit is only a valid iterator for 0 <= limit <= size().
	if (limit < 0) {
		limit = 0;
	}
	if (static_cast<candiPhrase::size_type>(limit) < forErase.size()) {
		forErase.erase(forErase.begin() + limit, forErase.end());
	}
}

/**
 * Collects the translation options for a source sentence and fills the
 * future-cost table.
 *
 * Every contiguous snippet of `fPhrase` spanning positions [start, start + extra]
 * with extra in 0..PHRASELIMIT is looked up in f2eVocab. A hit appends a
 * pointer to the stored candidate list and the snippet position to the two
 * output containers and seeds futureCost for that span with the largest
 * candidate cost (per the LESS comparator). A single word that is not in the
 * table gets a freshly allocated one-entry candidate list that carries the
 * word through unchanged. reCalculation() is run afterwards.
 *
 * NOTE(review): the outer bound is `<= PHRASELIMIT`, so snippets of up to
 * PHRASELIMIT + 1 words are looked up -- confirm that this is intended.
 * NOTE(review): the objects allocated for unknown words are handed out via
 * `phraseSnippet`; nothing in this function frees them.
 *
 * @param fPhrase                Source sentence as vocabulary ids.
 * @param phraseSnippet          Receives one candidate-list pointer per option.
 * @param phraseSnippetPosition  Receives the matching [start, end] positions.
 * @return The number of words in `fPhrase`.
 */
int TransOptions::getEPhrase(vector<int> fPhrase, PhraseSnippet& phraseSnippet, PhraseSnippetPosition& phraseSnippetPosition)
{
	const int len = fPhrase.size();

	// Make the table len x len and reset every span [row, col] with row <= col.
	futureCost.resize(len);
	for (int row = 0; row < len; ++row)
	{
		futureCost[row].resize(len);
	}
	for (int row = 0; row < len; ++row)
	{
		for (int col = row; col < len; ++col)
		{
			futureCost[row][col] = INFINITE;
		}
	}

	// Collect the translation options and the initial future costs.
	for (int extra = 0; extra <= PHRASELIMIT; ++extra)
	{
		for (int start = 0; start + extra < len; ++start)
		{
			const int stop = start + extra;

			fPosition posTmp;
			posTmp.start = start;
			posTmp.end = stop;

			// Words start..stop inclusive.
			vector<int> fPhraseSnippet(fPhrase.begin() + start, fPhrase.begin() + stop + 1);

			phraseTable::iterator findT = f2eVocab.find(fPhraseSnippet);
			if (findT != f2eVocab.end())
			{
				phraseSnippet.push_back(&(findT->second));
				phraseSnippetPosition.push_back(posTmp);
				futureCost[start][stop] = (**max_element((findT->second).begin(), (findT->second).end(), LESS())).c;
				continue;
			}

			// Only single words get a fallback entry; longer misses keep INFINITE.
			if (extra != 0)
			{
				continue;
			}

			aboutEPhrase *aboute = new aboutEPhrase;
			candiPhrase *cp = new candiPhrase;

			if (fPhraseSnippet[0] > 0) {
				// Register the word's surface form under a new id taken by
				// decrementing Vocab::ID, in both vocabularies' unkTMP maps.
				int ids = --Vocab::ID;
				string cn = CNVOCAB->getWord(fPhraseSnippet[0]);
				CNVOCAB->unkTMP.insert(make_pair(ids, cn));
				ENVOCAB->unkTMP.insert(make_pair(ids, cn));
				fPhraseSnippet.clear();
				fPhraseSnippet.push_back(ids);
			}
			aboute->ephrase = fPhraseSnippet;
			aboute->pC = 0;
			aboute->c = wordsProb(fPhraseSnippet);
			cp->push_back(aboute);
			phraseSnippet.push_back(cp);
			phraseSnippetPosition.push_back(posTmp);
			// The list holds exactly one entry, so its maximum is that entry.
			futureCost[start][stop] = aboute->c;
		}
	}

	reCalculation();

	if (!printmore)
	{
		return len;
	}

	cout << "print translation options ... " << endl;
	const int optionCount = phraseSnippet.size();
	for (int opt = 0; opt < optionCount; ++opt)
	{
		candiPhrase *options = phraseSnippet[opt];
		int lenT = options->size();

		// Rebuild the source words covered by this option.
		vector<int> fWords;
		for (int position = phraseSnippetPosition[opt].start; position <= phraseSnippetPosition[opt].end; position++)
		{
			fWords.push_back(fPhrase[position]);
		}

		cout << "[ " << CNVOCAB->getWords(fWords) << " ]\t" << lenT << endl;
		for (candiPhrase::iterator cand = options->begin(); cand != options->end(); ++cand)
		{
			cout << "\t" << ENVOCAB->getWords((*cand)->ephrase) << ", " << (*cand)->pC << ", " << (*cand)->c << endl;
		}
	}

	cout << "print future cost" << endl;
	// printFC() writes the same "future costs from ... to ... is ..." lines.
	printFC();
	cout << "Get translation options finished !" << endl;

	return len;
}

/**
 * Completes the future-cost table by combining shorter spans.
 *
 * Spans are visited by increasing width. For a span [first, last] every
 * split point is tried, and futureCost[first][last] is raised to
 * futureCost[first][cut] + futureCost[cut + 1][last] whenever that sum
 * exceeds the current value by more than avs.
 */
void TransOptions::reCalculation()
{
	const int size = futureCost.size();
	for (int width = 1; width < size; ++width)
	{
		for (int first = 0; first + width < size; ++first)
		{
			const int last = first + width;
			for (int cut = first; cut < last; ++cut)
			{
				const double combined = futureCost[first][cut] + futureCost[cut + 1][last];
				if (combined - futureCost[first][last] > avs)
				{
					futureCost[first][last] = combined;
				}
			}
		}
	}
}

/**
 * Looks up the future cost stored for the span [from, to].
 *
 * No range check is made against the table size.
 *
 * @param from  First position of the span.
 * @param to    Last position of the span.
 * @return futureCost[from][to], or 0.0 when from > to.
 */
double TransOptions::getFutureCost(int from, int to)
{
	if (from > to)
	{
		return 0.0;
	}
	return futureCost[from][to];
}

/**
 * Prints every future-cost entry for spans [first, last] with first <= last
 * to cout, one line per span.
 */
void TransOptions::printFC()
{
	const int size = futureCost.size();
	int first = 0;
	while (first < size)
	{
		int last = first;
		while (last < size)
		{
			cout << "future costs from " << first << " to " << last << " is " << futureCost[first][last] << endl;
			++last;
		}
		++first;
	}
}

/**
 * Computes the weighted language-model score of a target phrase plus its
 * word penalty.
 *
 * LM->wordProb() is called on the full phrase and then on each shorter
 * prefix obtained by dropping the last word, once per word; every result is
 * scaled by WLM and summed. (Presumably wordProb scores the final word given
 * the preceding ones -- confirm against LanguageModel.)
 *
 * @param Ephrase  Target phrase as vocabulary ids (taken by value because it
 *                 is consumed while scoring).
 * @return The summed score minus WEIGHTW times the number of words.
 */
double TransOptions::wordsProb(vector<int> Ephrase)
{
	const int len = Ephrase.size();
	double lmscore = 0;
	for (int i = 0; i < len; i++)
	{
		double lmTmp = LM->wordProb(Ephrase) * WLM;
		lmscore += lmTmp;
		// pop_back() drops the last word without reusing an iterator that
		// the previous erase() had already invalidated.
		Ephrase.pop_back();
	}
	return lmscore + WEIGHTW * len * (-1);
}

/**
 * Frees every candidate object referenced from the phrase table f2eVocab.
 */
TransOptions::~TransOptions()
{
	for (phraseTable::iterator entry = f2eVocab.begin(); entry != f2eVocab.end(); ++entry)
	{
		candiPhrase &candidates = entry->second;
		for (candiPhrase::size_type k = 0; k < candidates.size(); ++k)
		{
			delete candidates[k];
		}
	}
}
transoptions.cpp - 源码说明

本页面展示了「解码器是基于短语的统计机器翻译系统的核心模块」中的 transoptions.cpp 源码文件，采用 C++ 编程语言编写，共 347 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与解码器相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?