staticdata.cpp.svn-base
来自「解码器是基于短语的统计机器翻译系统的核心模块」· SVN-BASE 代码 · 共 746 行 · 第 1/2 页
SVN-BASE
746 行
// $Id$// vim:tabstop=2/***********************************************************************Moses - factored phrase-based language decoderCopyright (C) 2006 University of EdinburghThis library is free software; you can redistribute it and/ormodify it under the terms of the GNU Lesser General PublicLicense as published by the Free Software Foundation; eitherversion 2.1 of the License, or (at your option) any later version.This library is distributed in the hope that it will be useful,but WITHOUT ANY WARRANTY; without even the implied warranty ofMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNULesser General Public License for more details.You should have received a copy of the GNU Lesser General PublicLicense along with this library; if not, write to the Free SoftwareFoundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA***********************************************************************/#include <string>#include <cassert>#include "PhraseDictionaryMemory.h"#include "DecodeStepTranslation.h"#include "DecodeStepGeneration.h"#include "GenerationDictionary.h"#include "DummyScoreProducers.h"#include "StaticData.h"#include "Util.h"#include "FactorCollection.h"#include "HypothesisCollection.h"#include "Timer.h"#include "LanguageModelSingleFactor.h"#include "LanguageModelMultiFactor.h"#include "LanguageModelFactory.h"#include "LexicalReordering.h"#include "SentenceStats.h"#include "PhraseDictionaryTreeAdaptor.h"#include "UserMessage.h"using namespace std;static size_t CalcMax(size_t x, const vector<size_t>& y) { size_t max = x; for (vector<size_t>::const_iterator i=y.begin(); i != y.end(); ++i) if (*i > max) max = *i; return max;}static size_t CalcMax(size_t x, const vector<size_t>& y, const vector<size_t>& z) { size_t max = x; for (vector<size_t>::const_iterator i=y.begin(); i != y.end(); ++i) if (*i > max) max = *i; for (vector<size_t>::const_iterator i=z.begin(); i != z.end(); ++i) if (*i > max) max = *i; return max;}StaticData* StaticData::s_instance(0);StaticData::StaticData():m_fLMsLoaded(false),m_inputType(0),m_numInputScores(0),m_distortionScoreProducer(0),m_wpProducer(0),m_useDistortionFutureCosts(false),m_isDetailedTranslationReportingEnabled(false) ,m_onlyDistinctNBest(false),m_computeLMBackoffStats(false),m_factorDelimiter("|") // default delimiter between factors{ m_maxFactorIdx[0] = 0; // source side m_maxFactorIdx[1] = 0; // target side s_instance = this; // memory pools Phrase::InitializeMemPool();}bool StaticData::LoadData(Parameter *parameter){ ResetUserTime(); m_parameter = parameter; // verbose level m_verboseLevel = 1; if (m_parameter->GetParam("verbose").size() == 1) { m_verboseLevel = Scan<size_t>( m_parameter->GetParam("verbose")[0]); } // input type has to be specified BEFORE loading the phrase tables! if(m_parameter->GetParam("inputtype").size()) m_inputType=Scan<int>(m_parameter->GetParam("inputtype")[0]); VERBOSE(2,"input type is: "<<(m_inputType?"confusion net":"text input")<<"\n"); // factor delimiter if (m_parameter->GetParam("factor-delimiter").size() > 0) { m_factorDelimiter = m_parameter->GetParam("factor-delimiter")[0]; } // n-best if (m_parameter->GetParam("n-best-list").size() >= 2) { m_nBestFilePath = m_parameter->GetParam("n-best-list")[0]; m_nBestSize = Scan<size_t>( m_parameter->GetParam("n-best-list")[1] ); m_onlyDistinctNBest=(m_parameter->GetParam("n-best-list").size()>2 && m_parameter->GetParam("n-best-list")[2]=="distinct"); } else { m_nBestSize = 0; } // include feature names in the n-best list SetBooleanParameter( &m_labeledNBestList, "labeled-n-best-list", true ); // printing source phrase spans SetBooleanParameter( &m_reportSegmentation, "report-segmentation", false ); // print all factors of output translations SetBooleanParameter( &m_reportAllFactors, "report-all-factors", false ); //input factors const vector<string> &inputFactorVector = m_parameter->GetParam("input-factors"); for(size_t i=0; i<inputFactorVector.size(); i++) { m_inputFactorOrder.push_back(Scan<FactorType>(inputFactorVector[i])); } if(m_inputFactorOrder.empty()) { UserMessage::Add(string("no input factor specified in config file")); return false; } //output factors const vector<string> &outputFactorVector = m_parameter->GetParam("output-factors"); for(size_t i=0; i<outputFactorVector.size(); i++) { m_outputFactorOrder.push_back(Scan<FactorType>(outputFactorVector[i])); } if(m_outputFactorOrder.empty()) { // default. output factor 0 m_outputFactorOrder.push_back(0); } //source word deletion SetBooleanParameter( &m_wordDeletionEnabled, "phrase-drop-allowed", false ); // additional output SetBooleanParameter( &m_isDetailedTranslationReportingEnabled, "translation-details", false ); SetBooleanParameter( &m_computeLMBackoffStats, "lmstats", false ); if (m_computeLMBackoffStats && ! m_isDetailedTranslationReportingEnabled) { TRACE_ERR( "-lmstats implies -translation-details, enabling" << std::endl); m_isDetailedTranslationReportingEnabled = true; } // score weights const vector<string> distortionWeights = m_parameter->GetParam("weight-d"); m_weightDistortion = Scan<float>(distortionWeights[0]); m_weightWordPenalty = Scan<float>( m_parameter->GetParam("weight-w")[0] ); m_distortionScoreProducer = new DistortionScoreProducer; m_allWeights.push_back(m_weightDistortion); m_wpProducer = new WordPenaltyProducer; m_allWeights.push_back(m_weightWordPenalty); // misc m_maxHypoStackSize = (m_parameter->GetParam("stack").size() > 0) ? Scan<size_t>(m_parameter->GetParam("stack")[0]) : DEFAULT_MAX_HYPOSTACK_SIZE; m_maxDistortion = (m_parameter->GetParam("distortion-limit").size() > 0) ? Scan<int>(m_parameter->GetParam("distortion-limit")[0]) : -1; m_useDistortionFutureCosts = (m_parameter->GetParam("use-distortion-future-costs").size() > 0) ? Scan<bool>(m_parameter->GetParam("use-distortion-future-costs")[0]) : false; //TRACE_ERR( "using distortion future costs? "<<UseDistortionFutureCosts()<<"\n"); m_beamThreshold = (m_parameter->GetParam("beam-threshold").size() > 0) ? TransformScore(Scan<float>(m_parameter->GetParam("beam-threshold")[0])) : TransformScore(DEFAULT_BEAM_THRESHOLD); m_maxNoTransOptPerCoverage = (m_parameter->GetParam("max-trans-opt-per-coverage").size() > 0) ? Scan<size_t>(m_parameter->GetParam("max-trans-opt-per-coverage")[0]) : DEFAULT_MAX_TRANS_OPT_SIZE; //TRACE_ERR( "max translation options per coverage span: "<<m_maxNoTransOptPerCoverage<<"\n"); m_maxNoPartTransOpt = (m_parameter->GetParam("max-partial-trans-opt").size() > 0) ? Scan<size_t>(m_parameter->GetParam("max-partial-trans-opt")[0]) : DEFAULT_MAX_PART_TRANS_OPT_SIZE; // Unknown Word Processing -- wade //TODO replace this w/general word dropping -- EVH SetBooleanParameter( &m_dropUnknown, "drop-unknown", false ); if (!LoadLexicalReorderingModel()) return false; if (!LoadLanguageModels()) return false; if (!LoadGenerationTables()) return false; if (!LoadPhraseTables()) return false; if (!LoadMapping()) return false; return true;}void StaticData::SetBooleanParameter( bool *parameter, string parameterName, bool defaultValue ) { // default value if nothing is specified *parameter = defaultValue; if (! m_parameter->isParamSpecified( parameterName ) ) { return; } // if parameter is just specified as, e.g. "-parameter" set it true if (m_parameter->GetParam( parameterName ).size() == 0) { *parameter = true; } // if paramter is specified "-parameter true" or "-parameter false" else if (m_parameter->GetParam( parameterName ).size() == 1) { *parameter = Scan<bool>( m_parameter->GetParam( parameterName )[0]); }}StaticData::~StaticData(){ delete m_parameter; RemoveAllInColl(m_phraseDictionary); RemoveAllInColl(m_generationDictionary); RemoveAllInColl(m_languageModel); RemoveAllInColl(m_decodeStepList); RemoveAllInColl(m_reorderModels); // small score producers delete m_distortionScoreProducer; delete m_wpProducer; // memory pools Phrase::FinalizeMemPool();}bool StaticData::LoadLexicalReorderingModel(){ // load Lexical Reordering model //distortion weights const vector<string> distortionWeights = m_parameter->GetParam("weight-d"); //distortional model weights (first weight is distance distortion) std::vector<float> distortionModelWeights; for(size_t dist=1; dist < distortionWeights.size(); dist++) { distortionModelWeights.push_back(Scan<float>(distortionWeights[dist])); } const vector<string> &lrFileVector = m_parameter->GetParam("distortion-file"); for(unsigned int i=0; i< lrFileVector.size(); i++ ) //loops for each distortion model { vector<string> specification = Tokenize<string>(lrFileVector[i]," "); if (specification.size() != 4 ) { TRACE_ERR("ERROR: Expected format 'factors type weight-count filePath' in specification of distortion file " << i << std::endl << lrFileVector[i] << std::endl); return false; } //defaults, but at least one of these per model should be explicitly specified in the .ini file int orientation = DistortionOrientationType::Msd, direction = LexReorderType::Backward, condition = LexReorderType::Fe; //Loop through, overriding defaults with specifications vector<string> parameters = Tokenize<string>(specification[1],"-"); for (size_t param=0; param<parameters.size(); param++) { string val = ToLower(parameters[param]); //orientation if(val == "monotone" || val == "monotonicity") orientation = DistortionOrientationType::Monotone; else if(val == "msd" || val == "orientation") orientation = DistortionOrientationType::Msd; //direction else if(val == "forward") direction = LexReorderType::Forward; else if(val == "backward" || val == "unidirectional") direction = LexReorderType::Backward; else if(val == "bidirectional") direction = LexReorderType::Bidirectional; //condition else if(val == "f") condition = LexReorderType::F; else if(val == "fe") condition = LexReorderType::Fe; //unknown specification else { TRACE_ERR("ERROR: Unknown orientation type specification '" << val << "'" << endl); return false; } if (orientation == DistortionOrientationType::Msd) m_sourceStartPosMattersForRecombination = true; } //compute the number of weights that ought to be in the table from this size_t numWeightsInTable = 0; if(orientation == DistortionOrientationType::Monotone) { numWeightsInTable = 2; } else { numWeightsInTable = 3; } if(direction == LexReorderType::Bidirectional) { numWeightsInTable *= 2; } size_t specifiedNumWeights = Scan<size_t>(specification[2]); if (specifiedNumWeights != numWeightsInTable) { stringstream strme; strme << "specified number of weights (" << specifiedNumWeights << ") does not match correct number of weights for this type (" << numWeightsInTable << std::endl; UserMessage::Add(strme.str()); } //factors involved in this table vector<string> inputfactors = Tokenize(specification[0],"-"); vector<FactorType> input,output; if(inputfactors.size() > 1) { input = Tokenize<FactorType>(inputfactors[0],","); output= Tokenize<FactorType>(inputfactors[1],","); } else { input.push_back(0); // default, just in case the user is actually using a bidirectional model output = Tokenize<FactorType>(inputfactors[0],","); } std::vector<float> m_lexWeights; //will store the weights for this particular distortion reorderer std::vector<float> newLexWeights; //we'll remove the weights used by this distortion reorder, leaving the weights yet to be used if(specifiedNumWeights == 1) // this is useful if the user just wants to train one weight for the model { //add appropriate weight to weight vector assert(distortionModelWeights.size()> 0); //if this fails the user has not specified enough weights float wgt = distortionModelWeights[0]; for(size_t i=0; i<numWeightsInTable; i++) { m_lexWeights.push_back(wgt); } //update the distortionModelWeight vector to remove these weights std::vector<float> newLexWeights; //plus one as the first weight should always be distance-distortion for(size_t i=1; i<distortionModelWeights.size(); i++) { newLexWeights.push_back(distortionModelWeights[i]); } distortionModelWeights = newLexWeights; } else { //add appropriate weights to weight vector for(size_t i=0; i< numWeightsInTable; i++) { assert(i < distortionModelWeights.size()); //if this fails the user has not specified enough weights m_lexWeights.push_back(distortionModelWeights[i]);
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?