staticdata.cpp.svn-base

来自「解码器是基于短语的统计机器翻译系统的核心模块」· SVN-BASE 代码 · 共 746 行 · 第 1/2 页

SVN-BASE
746
字号
// $Id$// vim:tabstop=2/***********************************************************************Moses - factored phrase-based language decoderCopyright (C) 2006 University of EdinburghThis library is free software; you can redistribute it and/ormodify it under the terms of the GNU Lesser General PublicLicense as published by the Free Software Foundation; eitherversion 2.1 of the License, or (at your option) any later version.This library is distributed in the hope that it will be useful,but WITHOUT ANY WARRANTY; without even the implied warranty ofMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNULesser General Public License for more details.You should have received a copy of the GNU Lesser General PublicLicense along with this library; if not, write to the Free SoftwareFoundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA***********************************************************************/#include <string>#include <cassert>#include "PhraseDictionaryMemory.h"#include "DecodeStepTranslation.h"#include "DecodeStepGeneration.h"#include "GenerationDictionary.h"#include "DummyScoreProducers.h"#include "StaticData.h"#include "Util.h"#include "FactorCollection.h"#include "HypothesisCollection.h"#include "Timer.h"#include "LanguageModelSingleFactor.h"#include "LanguageModelMultiFactor.h"#include "LanguageModelFactory.h"#include "LexicalReordering.h"#include "SentenceStats.h"#include "PhraseDictionaryTreeAdaptor.h"#include "UserMessage.h"using namespace std;static size_t CalcMax(size_t x, const vector<size_t>& y) {  size_t max = x;  for (vector<size_t>::const_iterator i=y.begin(); i != y.end(); ++i)    if (*i > max) max = *i;  return max;}static size_t CalcMax(size_t x, const vector<size_t>& y, const vector<size_t>& z) {  size_t max = x;  for (vector<size_t>::const_iterator i=y.begin(); i != y.end(); ++i)    if (*i > max) max = *i;  for (vector<size_t>::const_iterator i=z.begin(); i != z.end(); ++i)    if (*i > max) max = *i;  return max;}StaticData* StaticData::s_instance(0);StaticData::StaticData():m_fLMsLoaded(false),m_inputType(0),m_numInputScores(0),m_distortionScoreProducer(0),m_wpProducer(0),m_useDistortionFutureCosts(false),m_isDetailedTranslationReportingEnabled(false) ,m_onlyDistinctNBest(false),m_computeLMBackoffStats(false),m_factorDelimiter("|") // default delimiter between factors{  m_maxFactorIdx[0] = 0;  // source side  m_maxFactorIdx[1] = 0;  // target side	s_instance = this;	// memory pools	Phrase::InitializeMemPool();}bool StaticData::LoadData(Parameter *parameter){	ResetUserTime();	m_parameter = parameter;		// verbose level	m_verboseLevel = 1;	if (m_parameter->GetParam("verbose").size() == 1)  {	m_verboseLevel = Scan<size_t>( m_parameter->GetParam("verbose")[0]);  }	// input type has to be specified BEFORE loading the phrase tables!	if(m_parameter->GetParam("inputtype").size()) 		m_inputType=Scan<int>(m_parameter->GetParam("inputtype")[0]);	VERBOSE(2,"input type is: "<<(m_inputType?"confusion net":"text input")<<"\n");	// factor delimiter	if (m_parameter->GetParam("factor-delimiter").size() > 0) {		m_factorDelimiter = m_parameter->GetParam("factor-delimiter")[0];	}	// n-best	if (m_parameter->GetParam("n-best-list").size() >= 2)	{		m_nBestFilePath = m_parameter->GetParam("n-best-list")[0];		m_nBestSize = Scan<size_t>( m_parameter->GetParam("n-best-list")[1] );		m_onlyDistinctNBest=(m_parameter->GetParam("n-best-list").size()>2 && m_parameter->GetParam("n-best-list")[2]=="distinct");	}	else	{		m_nBestSize = 0;	}		// include feature names in the n-best list	SetBooleanParameter( &m_labeledNBestList, "labeled-n-best-list", true );	// printing source phrase spans	SetBooleanParameter( &m_reportSegmentation, "report-segmentation", false );	// print all factors of output translations	SetBooleanParameter( &m_reportAllFactors, "report-all-factors", false );	//input factors	const vector<string> &inputFactorVector = m_parameter->GetParam("input-factors");	for(size_t i=0; i<inputFactorVector.size(); i++) 	{		m_inputFactorOrder.push_back(Scan<FactorType>(inputFactorVector[i]));	}	if(m_inputFactorOrder.empty())	{		UserMessage::Add(string("no input factor specified in config file"));		return false;	}	//output factors	const vector<string> &outputFactorVector = m_parameter->GetParam("output-factors");	for(size_t i=0; i<outputFactorVector.size(); i++) 	{		m_outputFactorOrder.push_back(Scan<FactorType>(outputFactorVector[i]));	}	if(m_outputFactorOrder.empty())	{ // default. output factor 0		m_outputFactorOrder.push_back(0);	}	//source word deletion	SetBooleanParameter( &m_wordDeletionEnabled, "phrase-drop-allowed", false );	// additional output	SetBooleanParameter( &m_isDetailedTranslationReportingEnabled, 			     "translation-details", false );	SetBooleanParameter( &m_computeLMBackoffStats, "lmstats", false );	if (m_computeLMBackoffStats && 	    ! m_isDetailedTranslationReportingEnabled) {	  TRACE_ERR( "-lmstats implies -translation-details, enabling" << std::endl);	  m_isDetailedTranslationReportingEnabled = true;	}	// score weights	const vector<string> distortionWeights = m_parameter->GetParam("weight-d");		m_weightDistortion				= Scan<float>(distortionWeights[0]);	m_weightWordPenalty				= Scan<float>( m_parameter->GetParam("weight-w")[0] );	m_distortionScoreProducer = new DistortionScoreProducer;	m_allWeights.push_back(m_weightDistortion);	m_wpProducer = new WordPenaltyProducer;	m_allWeights.push_back(m_weightWordPenalty);	// misc	m_maxHypoStackSize = (m_parameter->GetParam("stack").size() > 0)				? Scan<size_t>(m_parameter->GetParam("stack")[0]) : DEFAULT_MAX_HYPOSTACK_SIZE;	m_maxDistortion = (m_parameter->GetParam("distortion-limit").size() > 0) ?		Scan<int>(m_parameter->GetParam("distortion-limit")[0])		: -1;	m_useDistortionFutureCosts = (m_parameter->GetParam("use-distortion-future-costs").size() > 0) 		? Scan<bool>(m_parameter->GetParam("use-distortion-future-costs")[0]) : false;	//TRACE_ERR( "using distortion future costs? "<<UseDistortionFutureCosts()<<"\n");		m_beamThreshold = (m_parameter->GetParam("beam-threshold").size() > 0) ?		TransformScore(Scan<float>(m_parameter->GetParam("beam-threshold")[0]))		: TransformScore(DEFAULT_BEAM_THRESHOLD);	m_maxNoTransOptPerCoverage = (m_parameter->GetParam("max-trans-opt-per-coverage").size() > 0)				? Scan<size_t>(m_parameter->GetParam("max-trans-opt-per-coverage")[0]) : DEFAULT_MAX_TRANS_OPT_SIZE;	//TRACE_ERR( "max translation options per coverage span: "<<m_maxNoTransOptPerCoverage<<"\n");	m_maxNoPartTransOpt = (m_parameter->GetParam("max-partial-trans-opt").size() > 0)				? Scan<size_t>(m_parameter->GetParam("max-partial-trans-opt")[0]) : DEFAULT_MAX_PART_TRANS_OPT_SIZE;	// Unknown Word Processing -- wade	//TODO replace this w/general word dropping -- EVH	SetBooleanParameter( &m_dropUnknown, "drop-unknown", false );	if (!LoadLexicalReorderingModel()) return false;	if (!LoadLanguageModels()) return false;	if (!LoadGenerationTables()) return false;	if (!LoadPhraseTables()) return false;	if (!LoadMapping()) return false;	return true;}void StaticData::SetBooleanParameter( bool *parameter, string parameterName, bool defaultValue ) {  // default value if nothing is specified  *parameter = defaultValue;  if (! m_parameter->isParamSpecified( parameterName ) )  {    return;  }  // if parameter is just specified as, e.g. "-parameter" set it true  if (m_parameter->GetParam( parameterName ).size() == 0)   {    *parameter = true;  }  // if paramter is specified "-parameter true" or "-parameter false"  else if (m_parameter->GetParam( parameterName ).size() == 1)   {    *parameter = Scan<bool>( m_parameter->GetParam( parameterName )[0]);  }}StaticData::~StaticData(){	delete m_parameter;	RemoveAllInColl(m_phraseDictionary);	RemoveAllInColl(m_generationDictionary);	RemoveAllInColl(m_languageModel);	RemoveAllInColl(m_decodeStepList);	RemoveAllInColl(m_reorderModels);		// small score producers	delete m_distortionScoreProducer;	delete m_wpProducer;	// memory pools	Phrase::FinalizeMemPool();}bool StaticData::LoadLexicalReorderingModel(){	// load Lexical Reordering model		//distortion weights	const vector<string> distortionWeights = m_parameter->GetParam("weight-d");		//distortional model weights (first weight is distance distortion)	std::vector<float> distortionModelWeights; 	for(size_t dist=1; dist < distortionWeights.size(); dist++) 	{ 		distortionModelWeights.push_back(Scan<float>(distortionWeights[dist])); 	}	const vector<string> &lrFileVector = 		m_parameter->GetParam("distortion-file");		for(unsigned int i=0; i< lrFileVector.size(); i++ ) //loops for each distortion model	{		vector<string> specification = Tokenize<string>(lrFileVector[i]," ");			if (specification.size() != 4 )			{			  TRACE_ERR("ERROR: Expected format 'factors type weight-count filePath' in specification of distortion file " << i << std::endl << lrFileVector[i] << std::endl);			  return false;			}	  		//defaults, but at least one of these per model should be explicitly specified in the .ini file		int orientation = DistortionOrientationType::Msd, 		  direction = LexReorderType::Backward,		  condition = LexReorderType::Fe;		//Loop through, overriding defaults with specifications		vector<string> parameters = Tokenize<string>(specification[1],"-");		for (size_t param=0; param<parameters.size(); param++)		{			string val = ToLower(parameters[param]);			//orientation 			if(val == "monotone" || val == "monotonicity")				orientation = DistortionOrientationType::Monotone; 			else if(val == "msd" || val == "orientation")				orientation = DistortionOrientationType::Msd;			//direction			else if(val == "forward")				direction = LexReorderType::Forward;			else if(val == "backward" || val == "unidirectional")				direction = LexReorderType::Backward; 			else if(val == "bidirectional")				direction = LexReorderType::Bidirectional;			//condition			else if(val == "f")				condition = LexReorderType::F; 			else if(val == "fe")				condition = LexReorderType::Fe; 			//unknown specification			else {			  TRACE_ERR("ERROR: Unknown orientation type specification '" << val << "'" << endl);			  return false;			}			if (orientation == DistortionOrientationType::Msd) 				m_sourceStartPosMattersForRecombination = true;		}  		//compute the number of weights that ought to be in the table from this		size_t numWeightsInTable = 0;		if(orientation == DistortionOrientationType::Monotone)		{			numWeightsInTable = 2;		}		else		{			numWeightsInTable = 3;		}		if(direction == LexReorderType::Bidirectional)		{			numWeightsInTable *= 2;		}		size_t specifiedNumWeights = Scan<size_t>(specification[2]);		if (specifiedNumWeights != numWeightsInTable) 		{			stringstream strme;		  strme << "specified number of weights (" 			    << specifiedNumWeights 			    << ") does not match correct number of weights for this type (" 			    << numWeightsInTable << std::endl;		  UserMessage::Add(strme.str());    }		//factors involved in this table		vector<string> inputfactors = Tokenize(specification[0],"-");		vector<FactorType> 	input,output;		if(inputfactors.size() > 1)		{			input	= Tokenize<FactorType>(inputfactors[0],",");			output= Tokenize<FactorType>(inputfactors[1],",");		}		else		{			input.push_back(0); // default, just in case the user is actually using a bidirectional model			output = Tokenize<FactorType>(inputfactors[0],",");		}		std::vector<float> m_lexWeights; 			//will store the weights for this particular distortion reorderer		std::vector<float> newLexWeights;     //we'll remove the weights used by this distortion reorder, leaving the weights yet to be used		if(specifiedNumWeights == 1) // this is useful if the user just wants to train one weight for the model		{			//add appropriate weight to weight vector			assert(distortionModelWeights.size()> 0); //if this fails the user has not specified enough weights			float wgt = distortionModelWeights[0];			for(size_t i=0; i<numWeightsInTable; i++)			{				m_lexWeights.push_back(wgt);			}			//update the distortionModelWeight vector to remove these weights			std::vector<float> newLexWeights; //plus one as the first weight should always be distance-distortion			for(size_t i=1; i<distortionModelWeights.size(); i++)			{				newLexWeights.push_back(distortionModelWeights[i]);			}			distortionModelWeights = newLexWeights;		}		else		{			//add appropriate weights to weight vector			for(size_t i=0; i< numWeightsInTable; i++)			{				assert(i < distortionModelWeights.size()); //if this fails the user has not specified enough weights				m_lexWeights.push_back(distortionModelWeights[i]);

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?