staticdata.cpp.svn-base
// $Id$
// vim:tabstop=2

/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/

#include <string>
#include <cassert>
#include "PhraseDictionaryMemory.h"
#include "DecodeStepTranslation.h"
#include "DecodeStepGeneration.h"
#include "GenerationDictionary.h"
#include "DummyScoreProducers.h"
#include "StaticData.h"
#include "Util.h"
#include "FactorCollection.h"
#include "HypothesisStack.h"
#include "Timer.h"
#include "LanguageModelSingleFactor.h"
#include "LanguageModelMultiFactor.h"
#include "LanguageModelFactory.h"
#include "LexicalReordering.h"
#include "SentenceStats.h"
#include "PhraseDictionaryTreeAdaptor.h"
#include "UserMessage.h"
#include "TranslationOption.h"
#include "DecodeGraph.h"

using namespace std;

static size_t CalcMax(size_t x, const vector<size_t>& y) {
  size_t max = x;
  for (vector<size_t>::const_iterator i=y.begin(); i != y.end(); ++i)
    if (*i > max) max = *i;
  return max;
}

static size_t CalcMax(size_t x, const vector<size_t>& y, const vector<size_t>& z) {
  size_t max = x;
  for (vector<size_t>::const_iterator i=y.begin(); i != y.end(); ++i)
    if (*i > max) max = *i;
  for (vector<size_t>::const_iterator i=z.begin(); i != z.end(); ++i)
    if (*i > max) max = *i;
  return max;
}

StaticData StaticData::s_instance;

StaticData::StaticData()
:m_fLMsLoaded(false)
,m_inputType(SentenceInput)
,m_numInputScores(0)
,m_distortionScoreProducer(0)
,m_wpProducer(0)
,m_useDistortionFutureCosts(false)
,m_isDetailedTranslationReportingEnabled(false)
,m_onlyDistinctNBest(false)
,m_computeLMBackoffStats(false)
,m_factorDelimiter("|") // default delimiter between factors
,m_isAlwaysCreateDirectTranslationOption(true)
{
  m_maxFactorIdx[0] = 0;  // source side
  m_maxFactorIdx[1] = 0;  // target side

  // memory pools
  Phrase::InitializeMemPool();
}

bool StaticData::LoadData(Parameter *parameter)
{
  ResetUserTime();
  m_parameter = parameter;

  // verbose level
  m_verboseLevel = 1;
  if (m_parameter->GetParam("verbose").size() == 1) {
    m_verboseLevel = Scan<size_t>( m_parameter->GetParam("verbose")[0]);
  }

  // input type has to be specified BEFORE loading the phrase tables!
  if(m_parameter->GetParam("inputtype").size())
    m_inputType = (InputTypeEnum) Scan<int>(m_parameter->GetParam("inputtype")[0]);
  std::string s_it = "text input";
  if (m_inputType == 1) { s_it = "confusion net"; }
  if (m_inputType == 2) { s_it = "word lattice"; }
  VERBOSE(2,"input type is: "<<s_it<<"\n");

  if(m_parameter->GetParam("recover-input-path").size()) {
    m_recoverPath = Scan<bool>(m_parameter->GetParam("recover-input-path")[0]);
    if (m_recoverPath && m_inputType == SentenceInput) {
      TRACE_ERR("--recover-input-path should only be used with confusion net or word lattice input!\n");
      m_recoverPath = false;
    }
  }

  // factor delimiter
  if (m_parameter->GetParam("factor-delimiter").size() > 0) {
    m_factorDelimiter = m_parameter->GetParam("factor-delimiter")[0];
  }

  // n-best
  if (m_parameter->GetParam("n-best-list").size() >= 2) {
    m_nBestFilePath = m_parameter->GetParam("n-best-list")[0];
    m_nBestSize = Scan<size_t>( m_parameter->GetParam("n-best-list")[1] );
    m_onlyDistinctNBest = (m_parameter->GetParam("n-best-list").size()>2
                           && m_parameter->GetParam("n-best-list")[2]=="distinct");
    if (m_parameter->GetParam("n-best-factor").size() > 0) {
      m_nBestFactor = Scan<size_t>( m_parameter->GetParam("n-best-factor")[0]);
    }
  } else {
    m_nBestSize = 0;
  }

  // include feature names in the n-best list
  SetBooleanParameter( &m_labeledNBestList, "labeled-n-best-list", true );

  // include word alignment in the n-best list
  SetBooleanParameter( &m_nBestIncludesAlignment, "include-alignment-in-n-best", false );

  // printing source phrase spans
  SetBooleanParameter( &m_reportSegmentation, "report-segmentation", false );

  // print all factors of output translations
  SetBooleanParameter( &m_reportAllFactors, "report-all-factors", false );

  //
  if (m_inputType == SentenceInput) {
    SetBooleanParameter( &m_useTransOptCache, "use-persistent-cache", true );
  } else {
    m_useTransOptCache = false;
  }

  //input factors
  const vector<string> &inputFactorVector = m_parameter->GetParam("input-factors");
  for(size_t i=0; i<inputFactorVector.size(); i++) {
    m_inputFactorOrder.push_back(Scan<FactorType>(inputFactorVector[i]));
  }
  if(m_inputFactorOrder.empty()) {
    UserMessage::Add(string("no input factor specified in config file"));
    return false;
  }

  //output factors
  const vector<string> &outputFactorVector = m_parameter->GetParam("output-factors");
  for(size_t i=0; i<outputFactorVector.size(); i++) {
    m_outputFactorOrder.push_back(Scan<FactorType>(outputFactorVector[i]));
  }
  if(m_outputFactorOrder.empty()) {
    // default. output factor 0
    m_outputFactorOrder.push_back(0);
  }

  //source word deletion
  SetBooleanParameter( &m_wordDeletionEnabled, "phrase-drop-allowed", false );

  // additional output
  SetBooleanParameter( &m_isDetailedTranslationReportingEnabled, "translation-details", false );
  SetBooleanParameter( &m_computeLMBackoffStats, "lmstats", false );
  if (m_computeLMBackoffStats && ! m_isDetailedTranslationReportingEnabled) {
    VERBOSE(1, "-lmstats implies -translation-details, enabling" << std::endl);
    m_isDetailedTranslationReportingEnabled = true;
  }

  // score weights
  const vector<string> distortionWeights = m_parameter->GetParam("weight-d");
  m_weightDistortion  = Scan<float>(distortionWeights[0]);
  m_weightWordPenalty = Scan<float>( m_parameter->GetParam("weight-w")[0] );
  m_weightUnknownWord = 1; // do we want to let mert decide weight for this ???

  m_distortionScoreProducer = new DistortionScoreProducer(m_scoreIndexManager);
  m_allWeights.push_back(m_weightDistortion);

  m_wpProducer = new WordPenaltyProducer(m_scoreIndexManager);
  m_allWeights.push_back(m_weightWordPenalty);

  m_unknownWordPenaltyProducer = new UnknownWordPenaltyProducer(m_scoreIndexManager);
  m_allWeights.push_back(m_weightUnknownWord);

  // misc
  m_maxHypoStackSize = (m_parameter->GetParam("stack").size() > 0)
      ? Scan<size_t>(m_parameter->GetParam("stack")[0]) : DEFAULT_MAX_HYPOSTACK_SIZE;
  m_maxDistortion = (m_parameter->GetParam("distortion-limit").size() > 0)
      ? Scan<int>(m_parameter->GetParam("distortion-limit")[0]) : -1;
  m_useDistortionFutureCosts = (m_parameter->GetParam("use-distortion-future-costs").size() > 0)
      ? Scan<bool>(m_parameter->GetParam("use-distortion-future-costs")[0]) : false;
  m_beamWidth = (m_parameter->GetParam("beam-threshold").size() > 0)
      ? TransformScore(Scan<float>(m_parameter->GetParam("beam-threshold")[0]))
      : TransformScore(DEFAULT_BEAM_WIDTH);
  m_maxNoTransOptPerCoverage = (m_parameter->GetParam("max-trans-opt-per-coverage").size() > 0)
      ? Scan<size_t>(m_parameter->GetParam("max-trans-opt-per-coverage")[0]) : DEFAULT_MAX_TRANS_OPT_SIZE;
  m_maxNoPartTransOpt = (m_parameter->GetParam("max-partial-trans-opt").size() > 0)
      ? Scan<size_t>(m_parameter->GetParam("max-partial-trans-opt")[0]) : DEFAULT_MAX_PART_TRANS_OPT_SIZE;
  m_maxPhraseLength = (m_parameter->GetParam("max-phrase-length").size() > 0)
      ? Scan<size_t>(m_parameter->GetParam("max-phrase-length")[0]) : DEFAULT_MAX_PHRASE_LENGTH;

  // Unknown Word Processing -- wade
  //TODO replace this w/general word dropping -- EVH
  SetBooleanParameter( &m_dropUnknown, "drop-unknown", false );

  m_decoderType = (DecoderType) ((m_parameter->GetParam("decoder-type").size() > 0)
      ? Scan<int>(m_parameter->GetParam("decoder-type")[0]) : 0);
  m_mbrScale = (m_parameter->GetParam("mbr-scale").size() > 0)
      ? Scan<float>(m_parameter->GetParam("mbr-scale")[0]) : 1.0f;

  //default case
  if (m_parameter->GetParam("xml-input").size() == 0) m_xmlInputType = XmlPassThrough;
  else if (m_parameter->GetParam("xml-input")[0]=="exclusive") m_xmlInputType = XmlExclusive;
  else if (m_parameter->GetParam("xml-input")[0]=="inclusive") m_xmlInputType = XmlInclusive;
  else if (m_parameter->GetParam("xml-input")[0]=="ignore") m_xmlInputType = XmlIgnore;
  else if (m_parameter->GetParam("xml-input")[0]=="pass-through") m_xmlInputType = XmlPassThrough;
  else {
    UserMessage::Add("invalid xml-input value, must be pass-through, exclusive, inclusive, or ignore");
    return false;
  }

  if (!LoadLexicalReorderingModel()) return false;
  if (!LoadLanguageModels()) return false;
  if (!LoadGenerationTables()) return false;
  if (!LoadPhraseTables()) return false;
  if (!LoadMapping()) return false;

  return true;
}

void StaticData::SetBooleanParameter( bool *parameter, string parameterName, bool defaultValue )
{
  // default value if nothing is specified
  *parameter = defaultValue;
  if (! m_parameter->isParamSpecified( parameterName ) ) {
    return;
  }

  // if parameter is just specified as, e.g. "-parameter" set it true
  if (m_parameter->GetParam( parameterName ).size() == 0) {
    *parameter = true;
  }
  // if paramter is specified "-parameter true" or "-parameter false"
  else if (m_parameter->GetParam( parameterName ).size() == 1) {
    *parameter = Scan<bool>( m_parameter->GetParam( parameterName )[0]);
  }
}

StaticData::~StaticData()
{
  delete m_parameter;
  RemoveAllInColl(m_phraseDictionary);
  RemoveAllInColl(m_generationDictionary);
  RemoveAllInColl(m_languageModel);
  RemoveAllInColl(m_decodeStepVL);

  // delete trans opt
  map<Phrase, std::vector<TranslationOption*> >::iterator iterCache;
  for (iterCache = m_transOptCache.begin() ; iterCache != m_transOptCache.end() ; ++iterCache) {
    TranslationOptionList &transOptList = iterCache->second;
    RemoveAllInColl(transOptList);
  }

  RemoveAllInColl(m_reorderModels);

  // small score producers
  delete m_distortionScoreProducer;
  delete m_wpProducer;
  delete m_unknownWordPenaltyProducer;

  // memory pools
  Phrase::FinalizeMemPool();
}

bool StaticData::LoadLexicalReorderingModel()
{
  std::cerr << "Loading lexical distortion models...\n";
  const vector<string> fileStr    = m_parameter->GetParam("distortion-file");
  const vector<string> weightsStr = m_parameter->GetParam("weight-d");
  /*old code
  const vector<string> modelStr = m_parameter.GetParam("distortion-type"); //TODO check name?
  const vector<string> fileStr    = m_parameter.GetParam("distortion-file");
  const vector<string> weightsStr = m_parameter.GetParam("weight-d");
  */
  std::vector<float> weights;
  int w = 1; //cur weight
  int f = 0; //cur file
  //get weights values
  std::cerr << "have " << fileStr.size() << " models\n";
  for(size_t j = 0; j < weightsStr.size(); ++j){
    weights.push_back(Scan<float>(weightsStr[j]));
  }
  //load all models
  for(size_t i = 0; i < fileStr.size(); ++i) {
    //std::cerr << "Model " << i << ":";
    //Todo: 'else' should be 'else if(...)' to check it is a lexical model...
    vector<string> spec = Tokenize<string>(fileStr[f], " ");
    ++f; //mark file as consumed
    if(4 != spec.size()){
      //wrong file specification string...
      std::cerr << "Wrong Lexical Reordering Model Specification for model " << i << "!\n";
      return false;
    }
    //spec[0] = factor map
    //spec[1] = name
    //spec[2] = num weights
    //spec[3] = fileName

    //decode data into these
    vector<FactorType> input,output;
    LexicalReordering::Direction direction;
    LexicalReordering::Condition condition;
    int numWeights;

    //decode factor map
    vector<string> inputfactors = Tokenize(spec[0],"-");
    if(inputfactors.size() == 2){
      input  = Tokenize<FactorType>(inputfactors[0],",");
      output = Tokenize<FactorType>(inputfactors[1],",");
    }
    else if(inputfactors.size() == 1) {
      //if there is only one side assume it is on e side... why?
      output = Tokenize<FactorType>(inputfactors[0],",");
    }
    else {
      //format error
      return false;
    }

    //decode name
    vector<string> params = Tokenize<string>(spec[1],"-");
    std::string type(ToLower(params[0]));
    std::string dir;
    std::string cond;
    if(3 == params.size()) {
      //name format is 'type'-'direction'-'condition'
      dir  = ToLower(params[1]);
      cond = ToLower(params[2]);
    }
    else if(2 == params.size()) {
      //assume name format is 'type'-'condition' with implicit unidirectional
      std::cerr << "Warning: Lexical model type underspecified...assuming unidirectional in model " << i << "\n";
      dir  = "unidirectional";
      cond = ToLower(params[1]);
    }
    else {
      std::cerr << "Lexical model type underspecified for model " << i << "!\n";
      return false;
    }

    if(dir == "forward"){
      direction = LexicalReordering::Forward;
    }
    else if(dir == "backward" || dir == "unidirectional" || dir == "uni") {
      direction = LexicalReordering::Backward;
    }
    else if(dir == "bidirectional" || dir == "bi") {
      direction = LexicalReordering::Bidirectional;
    }
    else {
      std::cerr << "Unknown direction declaration '" << dir << "'for lexical reordering model " << i << "\n";
      return false;
    }

    if(cond == "f"){
      condition = LexicalReordering::F;
    }
    else if(cond == "fe") {
      condition = LexicalReordering::FE;
    }
    else if(cond == "fec") {
      condition = LexicalReordering::FEC;
    }
    else {
      std::cerr << "Unknown conditioning declaration '" << cond << "'for lexical reordering model " << i << "!\n";
      return false;
    }
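    // Example (hypothetical): a "distortion-file" entry that this parser would accept
    // has four whitespace-separated fields -- factor map, name, number of weights,
    // and file name -- e.g.
    //   0-0 msd-bidirectional-fe 6 /path/to/reordering-table
    // The name "msd-bidirectional-fe" is split on '-' into type / direction / condition,
    // which is what the Tokenize(spec[1],"-") block above decodes; the path and weight
    // count shown here are placeholders only, not values taken from this file.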