translationoptioncollection.cpp.svn-base

来自「解码器是基于短语的统计机器翻译系统的核心模块」· SVN-BASE 代码 · 共 460 行 · 第 1/2 页

SVN-BASE
460
字号
// $Id$// vim:tabstop=2/***********************************************************************Moses - factored phrase-based language decoderCopyright (C) 2006 University of EdinburghThis library is free software; you can redistribute it and/ormodify it under the terms of the GNU Lesser General PublicLicense as published by the Free Software Foundation; eitherversion 2.1 of the License, or (at your option) any later version.This library is distributed in the hope that it will be useful,but WITHOUT ANY WARRANTY; without even the implied warranty ofMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNULesser General Public License for more details.You should have received a copy of the GNU Lesser General PublicLicense along with this library; if not, write to the Free SoftwareFoundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA***********************************************************************/#include <algorithm>#include "TranslationOptionCollection.h"#include "Sentence.h"#include "DecodeStep.h"#include "LanguageModel.h"#include "PhraseDictionaryMemory.h"#include "FactorCollection.h"#include "InputType.h"#include "Util.h"#include "StaticData.h"using namespace std;/** constructor; since translation options are indexed by coverage span, the corresponding data structure is initialized here 	* This fn should be called by inherited classes*/TranslationOptionCollection::TranslationOptionCollection(InputType const& src, size_t maxNoTransOptPerCoverage)	: m_source(src)	,m_futureScore(src.GetSize())	,m_maxNoTransOptPerCoverage(maxNoTransOptPerCoverage){	// create 2-d vector	size_t size = src.GetSize();	for (size_t startPos = 0 ; startPos < size ; ++startPos)	{		m_collection.push_back( vector< TranslationOptionList >() );		for (size_t endPos = startPos ; endPos < size ; ++endPos)		{			m_collection[startPos].push_back( TranslationOptionList() );		}	}}/** destructor, clears out data structures */TranslationOptionCollection::~TranslationOptionCollection(){	// delete all trans opt	size_t size = m_source.GetSize();	for (size_t startPos = 0 ; startPos < size ; ++startPos)	{		for (size_t endPos = startPos ; endPos < size ; ++endPos)		{		 RemoveAllInColl(GetTranslationOptionList(startPos, endPos));		}	}}/** helper for pruning */bool CompareTranslationOption(const TranslationOption *a, const TranslationOption *b){	return a->GetFutureScore() > b->GetFutureScore();}void TranslationOptionCollection::Prune(){		size_t size = m_source.GetSize();		// prune to max no. of trans opt	if (m_maxNoTransOptPerCoverage == 0)		return;	size_t total = 0;	size_t totalPruned = 0;	for (size_t startPos = 0 ; startPos < size ; ++startPos)	{		for (size_t endPos = startPos ; endPos < size ; ++endPos)		{			TranslationOptionList &fullList = GetTranslationOptionList(startPos, endPos);			total += fullList.size();			if (fullList.size() <= m_maxNoTransOptPerCoverage)				continue;						// sort in vector			nth_element(fullList.begin(), fullList.begin() + m_maxNoTransOptPerCoverage, fullList.end(), CompareTranslationOption);			totalPruned += fullList.size() - m_maxNoTransOptPerCoverage;						// delete the rest			for (size_t i = m_maxNoTransOptPerCoverage ; i < fullList.size() ; ++i)			{				delete fullList[i];			}			fullList.resize(m_maxNoTransOptPerCoverage);		}	}	VERBOSE(2,"       Total translation options: " << total << std::endl		<< "Total translation options pruned: " << totalPruned << std::endl);}/** Force a creation of a translation option where there are none for a particular source position.* ie. where a source word has not been translated, create a translation option by*				1. not observing the table limits on phrase/generation tables*				2. using the handler ProcessUnknownWord()* Call this function once translation option collection has been filled with translation options** This function calls for unknown words is complicated by the fact it must handle different input types. * The call stack is*		Base::ProcessUnknownWord()*			Inherited::ProcessUnknownWord(position)*				Base::ProcessOneUnknownWord()** \param decodeStepList list of decoding steps* \param factorCollection input sentence with all factors*/void TranslationOptionCollection::ProcessUnknownWord(const std::list < DecodeStep* > &decodeStepList, FactorCollection &factorCollection){	size_t size = m_source.GetSize();	// try to translation for coverage with no trans by expanding table limit	for (size_t pos = 0 ; pos < size ; ++pos)	{			TranslationOptionList &fullList = GetTranslationOptionList(pos, pos);			size_t numTransOpt = fullList.size();			if (numTransOpt == 0)			{				CreateTranslationOptionsForRange(decodeStepList, factorCollection																			, pos, pos, false);			}	}			// create unknown words for 1 word coverage where we don't have any trans options	vector<bool> process(size);	fill(process.begin(), process.end(), true);		for (size_t startPos = 0 ; startPos < size ; ++startPos)	{		for (size_t endPos = startPos ; endPos < size ; ++endPos)		{			TranslationOptionList &fullList = GetTranslationOptionList(startPos, endPos);			size_t numTransOpt = fullList.size();			if (numTransOpt > 0)			{				fill(process.begin() + startPos, process.begin() + endPos + 1, false);			}		}		}				for (size_t currPos = 0 ; currPos < size ; ++currPos)	{		if (process[currPos])			ProcessUnknownWord(currPos, *m_factorCollection);	}}/** special handling of ONE unknown words. Either add temporarily add word to translation table,	* or drop the translation.	* This function should be called by the ProcessOneUnknownWord() in the inherited class	* At the moment, this unknown word handler is a bit of a hack, if copies over each factor from source	* to target word, or uses the 'UNK' factor.	* Ideally, this function should be in a class which can be expanded upon, for example, 	* to create a morphologically aware handler. 	*	* \param sourceWord the unknown word	* \param sourcePos	* \param factorCollection input sentence with all factors */void TranslationOptionCollection::ProcessOneUnknownWord(const Word &sourceWord,																														size_t sourcePos																												, FactorCollection &factorCollection){	// unknown word, add as trans opt		size_t isDigit = 0;		if (StaticData::Instance()->GetDropUnknown())		{			const Factor *f = sourceWord[0]; // TODO hack. shouldn't know which factor is surface			const string &s = f->GetString();			isDigit = s.find_first_of("0123456789");			if (isDigit == string::npos) 				isDigit = 0;			else 				isDigit = 1;			// modify the starting bitmap		}				TranslationOption *transOpt;		if (! StaticData::Instance()->GetDropUnknown() || isDigit)		{			// add to dictionary			TargetPhrase targetPhrase(Output);			Word &targetWord = targetPhrase.AddWord();									for (unsigned int currFactor = 0 ; currFactor < MAX_NUM_FACTORS ; currFactor++)			{				FactorType factorType = static_cast<FactorType>(currFactor);								const Factor *sourceFactor = sourceWord[currFactor];				if (sourceFactor == NULL)					targetWord[factorType] = factorCollection.AddFactor(Output, factorType, UNKNOWN_FACTOR);				else					targetWord[factorType] = factorCollection.AddFactor(Output, factorType, sourceFactor->GetString());			}				targetPhrase.SetScore();						transOpt = new TranslationOption(WordsRange(sourcePos, sourcePos), targetPhrase, 0);		}		else 		{ // drop source word. create blank trans opt			const TargetPhrase targetPhrase(Output);			transOpt = new TranslationOption(WordsRange(sourcePos, sourcePos), targetPhrase, 0);		}		transOpt->CalcScore();		Add(transOpt);}/** compute future score matrix in a dynamic programming fashion.

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?