translationoptioncollection.cpp.svn-base

来自「moses开源的机器翻译系统」· SVN-BASE 代码 · 共 593 行 · 第 1/2 页

SVN-BASE
593
字号
// $Id$// vim:tabstop=2/***********************************************************************Moses - factored phrase-based language decoderCopyright (C) 2006 University of EdinburghThis library is free software; you can redistribute it and/ormodify it under the terms of the GNU Lesser General PublicLicense as published by the Free Software Foundation; eitherversion 2.1 of the License, or (at your option) any later version.This library is distributed in the hope that it will be useful,but WITHOUT ANY WARRANTY; without even the implied warranty ofMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNULesser General Public License for more details.You should have received a copy of the GNU Lesser General PublicLicense along with this library; if not, write to the Free SoftwareFoundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA***********************************************************************/#include <algorithm>#include "TranslationOptionCollection.h"#include "Sentence.h"#include "DecodeStep.h"#include "LanguageModel.h"#include "PhraseDictionaryMemory.h"#include "FactorCollection.h"#include "InputType.h"#include "Util.h"#include "StaticData.h"#include "DecodeStepTranslation.h"#include "DecodeGraph.h"using namespace std;/** constructor; since translation options are indexed by coverage span, the corresponding data structure is initialized here 	* This fn should be called by inherited classes*/TranslationOptionCollection::TranslationOptionCollection(InputType const& src, size_t maxNoTransOptPerCoverage)	: m_source(src)	,m_futureScore(src.GetSize())	,m_maxNoTransOptPerCoverage(maxNoTransOptPerCoverage){	// create 2-d vector	size_t size = src.GetSize();	for (size_t startPos = 0 ; startPos < size ; ++startPos)	{		m_collection.push_back( vector< TranslationOptionList >() );    size_t maxSize = size - startPos + 1;    size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();    maxSize = (maxSize < maxSizePhrase) ? maxSize : maxSizePhrase;		for (size_t endPos = startPos ; endPos < startPos + maxSize ; ++endPos)		{			m_collection[startPos].push_back( TranslationOptionList() );		}	}}/** destructor, clears out data structures */TranslationOptionCollection::~TranslationOptionCollection(){	size_t size = m_source.GetSize();	for (size_t startPos = 0 ; startPos < size ; ++startPos)	{    size_t maxSize = size - startPos + 1;    size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();    maxSize = (maxSize < maxSizePhrase) ? maxSize : maxSizePhrase;		for (size_t endPos = startPos ; endPos < startPos + maxSize ; ++endPos)		{		 RemoveAllInColl(GetTranslationOptionList(startPos, endPos));		}	}	RemoveAllInColl(m_unksrcs);}/** helper for pruning */bool CompareTranslationOption(const TranslationOption *a, const TranslationOption *b){	return a->GetFutureScore() > b->GetFutureScore();}void TranslationOptionCollection::Prune(){		size_t size = m_source.GetSize();		// prune to max no. of trans opt	if (m_maxNoTransOptPerCoverage == 0)		return;	size_t total = 0;	size_t totalPruned = 0;	for (size_t startPos = 0 ; startPos < size; ++startPos)	{    size_t maxSize = size - startPos;    size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();    maxSize = (maxSize < maxSizePhrase) ? maxSize : maxSizePhrase;		for (size_t endPos = startPos ; endPos < startPos + maxSize ; ++endPos)		{			TranslationOptionList &fullList = GetTranslationOptionList(startPos, endPos);			total += fullList.size();			if (fullList.size() <= m_maxNoTransOptPerCoverage)				continue;						// sort in vector			nth_element(fullList.begin(), fullList.begin() + m_maxNoTransOptPerCoverage, fullList.end(), CompareTranslationOption);			totalPruned += fullList.size() - m_maxNoTransOptPerCoverage;						// delete the rest			for (size_t i = m_maxNoTransOptPerCoverage ; i < fullList.size() ; ++i)			{				delete fullList[i];			}			fullList.resize(m_maxNoTransOptPerCoverage);		}	}	VERBOSE(2,"       Total translation options: " << total << std::endl		<< "Total translation options pruned: " << totalPruned << std::endl);}/** Force a creation of a translation option where there are none for a particular source position.* ie. where a source word has not been translated, create a translation option by*				1. not observing the table limits on phrase/generation tables*				2. using the handler ProcessUnknownWord()* Call this function once translation option collection has been filled with translation options** This function calls for unknown words is complicated by the fact it must handle different input types. * The call stack is*		Base::ProcessUnknownWord()*			Inherited::ProcessUnknownWord(position)*				Base::ProcessOneUnknownWord()** \param decodeStepList list of decoding steps* \param factorCollection input sentence with all factors*/void TranslationOptionCollection::ProcessUnknownWord(const std::vector <DecodeGraph*> &decodeStepVL){	size_t size = m_source.GetSize();	// try to translation for coverage with no trans by expanding table limit	for (size_t startVL = 0 ; startVL < decodeStepVL.size() ; startVL++) 	{	  const DecodeGraph &decodeStepList = *decodeStepVL[startVL];		for (size_t pos = 0 ; pos < size ; ++pos)		{				TranslationOptionList &fullList = GetTranslationOptionList(pos, pos);				size_t numTransOpt = fullList.size();				if (numTransOpt == 0)				{					CreateTranslationOptionsForRange(decodeStepList																				, pos, pos, false);				}		}	}			bool alwaysCreateDirectTranslationOption = StaticData::Instance().IsAlwaysCreateDirectTranslationOption();	// create unknown words for 1 word coverage where we don't have any trans options	for (size_t pos = 0 ; pos < size ; ++pos)	{		TranslationOptionList &fullList = GetTranslationOptionList(pos, pos);		if (fullList.size() == 0 || alwaysCreateDirectTranslationOption)			ProcessUnknownWord(pos);	}}/** special handling of ONE unknown words. Either add temporarily add word to translation table,	* or drop the translation.	* This function should be called by the ProcessOneUnknownWord() in the inherited class	* At the moment, this unknown word handler is a bit of a hack, if copies over each factor from source	* to target word, or uses the 'UNK' factor.	* Ideally, this function should be in a class which can be expanded upon, for example, 	* to create a morphologically aware handler. 	*	* \param sourceWord the unknown word	* \param sourcePos	* \param factorCollection input sentence with all factors */void TranslationOptionCollection::ProcessOneUnknownWord(const Word &sourceWord,																														size_t sourcePos, size_t length){	// unknown word, add as trans opt	FactorCollection &factorCollection = FactorCollection::Instance();	size_t isDigit = 0;	if (StaticData::Instance().GetDropUnknown())	{		const Factor *f = sourceWord[0]; // TODO hack. shouldn't know which factor is surface		const string &s = f->GetString();		isDigit = s.find_first_of("0123456789");		if (isDigit == string::npos) 			isDigit = 0;		else 			isDigit = 1;		// modify the starting bitmap	}	Phrase* m_unksrc = new Phrase(Input); m_unksrc->AddWord() = sourceWord;	m_unksrcs.push_back(m_unksrc);		TranslationOption *transOpt;	if (! StaticData::Instance().GetDropUnknown() || isDigit)	{		// add to dictionary		TargetPhrase targetPhrase(Output);		Word &targetWord = targetPhrase.AddWord();							for (unsigned int currFactor = 0 ; currFactor < MAX_NUM_FACTORS ; currFactor++)		{			FactorType factorType = static_cast<FactorType>(currFactor);						const Factor *sourceFactor = sourceWord[currFactor];			if (sourceFactor == NULL)				targetWord[factorType] = factorCollection.AddFactor(Output, factorType, UNKNOWN_FACTOR);			else				targetWord[factorType] = factorCollection.AddFactor(Output, factorType, sourceFactor->GetString());		}		targetPhrase.SetScore();		targetPhrase.SetSourcePhrase(m_unksrc);		transOpt = new TranslationOption(WordsRange(sourcePos, sourcePos + length - 1), targetPhrase, m_source, 0);		}	else 	{ // drop source word. create blank trans opt		TargetPhrase targetPhrase(Output);		targetPhrase.SetSourcePhrase(m_unksrc);		transOpt = new TranslationOption(WordsRange(sourcePos, sourcePos + length - 1), targetPhrase, m_source, 0);	}	transOpt->CalcScore();	Add(transOpt);}/** compute future score matrix in a dynamic programming fashion.	* This matrix used in search.	* Call this function once translation option collection has been filled with translation options*/void TranslationOptionCollection::CalcFutureScore(){  // setup the matrix (ignore lower triangle, set upper triangle to -inf  size_t size = m_source.GetSize(); // the width of the matrix  for(size_t row=0; row<size; row++) {    for(size_t col=row; col<size; col++) {      m_futureScore.SetScore(row, col, -numeric_limits<float>::infinity());    }  }  // walk all the translation options and record the cheapest option for each span	for (size_t startPos = 0 ; startPos < m_source.GetSize() ; ++startPos)	{    size_t maxSize = m_source.GetSize() - startPos;    size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();    maxSize = (maxSize < maxSizePhrase) ? maxSize : maxSizePhrase;		for (size_t endPos = startPos ; endPos < startPos + maxSize ; ++endPos)		{			TranslationOptionList &transOptList = GetTranslationOptionList(startPos, endPos);			TranslationOptionList::const_iterator iterTransOpt;			for(iterTransOpt = transOptList.begin() ; iterTransOpt != transOptList.end() ; ++iterTransOpt) 			{				const TranslationOption &transOpt = **iterTransOpt;				float score = transOpt.GetFutureScore();				if (score > m_futureScore.GetScore(startPos, endPos))					m_futureScore.SetScore(startPos, endPos, score);			}		}	}  // now fill all the cells in the strictly upper triangle  //   there is no way to modify the diagonal now, in the case  //   where no translation option covers a single-word span,  //   we leave the +inf in the matrix  // like in chart parsing we want each cell to contain the highest score  // of the full-span trOpt or the sum of scores of joining two smaller spans	for(size_t colstart = 1; colstart < size ; colstart++) {		for(size_t diagshift = 0; diagshift < size-colstart ; diagshift++) {            size_t startPos = diagshift;            size_t endPos = colstart+diagshift;			for(size_t joinAt = startPos; joinAt < endPos ; joinAt++)  {              float joinedScore = m_futureScore.GetScore(startPos, joinAt)                                + m_futureScore.GetScore(joinAt+1, endPos);              /* // uncomment to see the cell filling scheme              TRACE_ERR( "[" <<startPos<<","<<endPos<<"] <-? ["<<startPos<<","<<joinAt<<"]+["<<joinAt+1<<","<<endPos                << "] (colstart: "<<colstart<<", diagshift: "<<diagshift<<")"<<endl);              */              if (joinedScore > m_futureScore.GetScore(startPos, endPos))                m_futureScore.SetScore(startPos, endPos, joinedScore);            }        }

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?