📄 staticdata.cpp.svn-base
字号:
//decode num weights (and fetch weight from array...) std::vector<float> mweights; numWeights = atoi(spec[2].c_str()); for(size_t k = 0; k < numWeights; ++k, ++w) { if(w >= weights.size()){ //error not enough weights... std::cerr << "Lexicalized distortion model: Not enough weights, add to [weight-d]\n"; return false; } else { mweights.push_back(weights[w]); } } //decode filename string filePath = spec[3]; //all ready load it //std::cerr << type; if("monotonicity" == type){ m_reorderModels.push_back(new LexicalMonotonicReordering(filePath, mweights, direction, condition, input, output)); } else if("orientation" == type || "msd" == type) { m_reorderModels.push_back(new LexicalOrientationReordering(filePath, mweights, direction, condition, input, output)); } else if("directional" == type) { m_reorderModels.push_back(new LexicalDirectionalReordering(filePath, mweights, direction, condition, input, output)); } else { //error unknown type! std::cerr << " ...unknown type!\n"; return false; } //std::cerr << "\n"; } return true;}bool StaticData::LoadLanguageModels(){ if (m_parameter->GetParam("lmodel-file").size() > 0) { // weights vector<float> weightAll = Scan<float>(m_parameter->GetParam("weight-l")); for (size_t i = 0 ; i < weightAll.size() ; i++) { m_allWeights.push_back(weightAll[i]); } // initialize n-gram order for each factor. populated only by factored lm const vector<string> &lmVector = m_parameter->GetParam("lmodel-file"); for(size_t i=0; i<lmVector.size(); i++) { vector<string> token = Tokenize(lmVector[i]); if (token.size() != 4 && token.size() != 5 ) { UserMessage::Add("Expected format 'LM-TYPE FACTOR-TYPE NGRAM-ORDER filePath [mapFilePath (only for IRSTLM)]'"); return false; } // type = implementation, SRI, IRST etc LMImplementation lmImplementation = static_cast<LMImplementation>(Scan<int>(token[0])); // factorType = 0 = Surface, 1 = POS, 2 = Stem, 3 = Morphology, etc vector<FactorType> factorTypes = Tokenize<FactorType>(token[1], ","); // nGramOrder = 2 = bigram, 3 = trigram, etc size_t nGramOrder = Scan<int>(token[2]); string &languageModelFile = token[3]; if (token.size() == 5) if (lmImplementation==IRST) languageModelFile += " " + token[4]; else { UserMessage::Add("Expected format 'LM-TYPE FACTOR-TYPE NGRAM-ORDER filePath [mapFilePath (only for IRSTLM)]'"); return false; } IFVERBOSE(1) PrintUserTime(string("Start loading LanguageModel ") + languageModelFile); LanguageModel *lm = LanguageModelFactory::CreateLanguageModel( lmImplementation , factorTypes , nGramOrder , languageModelFile , weightAll[i] , m_scoreIndexManager); if (lm == NULL) { UserMessage::Add("no LM created. We probably don't have it compiled"); return false; } m_languageModel.push_back(lm); } } // flag indicating that language models were loaded, // since phrase table loading requires their presence m_fLMsLoaded = true; IFVERBOSE(1) PrintUserTime("Finished loading LanguageModels"); return true;}bool StaticData::LoadGenerationTables(){ if (m_parameter->GetParam("generation-file").size() > 0) { const vector<string> &generationVector = m_parameter->GetParam("generation-file"); const vector<float> &weight = Scan<float>(m_parameter->GetParam("weight-generation")); IFVERBOSE(1) { TRACE_ERR( "weight-generation: "); for (size_t i = 0 ; i < weight.size() ; i++) { TRACE_ERR( weight[i] << "\t"); } TRACE_ERR(endl); } size_t currWeightNum = 0; for(size_t currDict = 0 ; currDict < generationVector.size(); currDict++) { vector<string> token = Tokenize(generationVector[currDict]); vector<FactorType> input = Tokenize<FactorType>(token[0], ",") ,output = Tokenize<FactorType>(token[1], ","); m_maxFactorIdx[1] = CalcMax(m_maxFactorIdx[1], input, output); string filePath; size_t numFeatures; numFeatures = Scan<size_t>(token[2]); filePath = token[3]; if (!FileExists(filePath) && FileExists(filePath + ".gz")) { filePath += ".gz"; } VERBOSE(1, filePath << endl); m_generationDictionary.push_back(new GenerationDictionary(numFeatures, m_scoreIndexManager)); assert(m_generationDictionary.back() && "could not create GenerationDictionary"); if (!m_generationDictionary.back()->Load(input , output , filePath , Output)) { delete m_generationDictionary.back(); return false; } for(size_t i = 0; i < numFeatures; i++) { assert(currWeightNum < weight.size()); m_allWeights.push_back(weight[currWeightNum++]); } } if (currWeightNum != weight.size()) { TRACE_ERR( " [WARNING] config file has " << weight.size() << " generation weights listed, but the configuration for generation files indicates there should be " << currWeightNum << "!\n"); } } return true;}bool StaticData::LoadPhraseTables(){ VERBOSE(2,"About to LoadPhraseTables" << endl); // language models must be loaded prior to loading phrase tables assert(m_fLMsLoaded); // load phrase translation tables if (m_parameter->GetParam("ttable-file").size() > 0) { // weights vector<float> weightAll = Scan<float>(m_parameter->GetParam("weight-t")); const vector<string> &translationVector = m_parameter->GetParam("ttable-file"); vector<size_t> maxTargetPhrase = Scan<size_t>(m_parameter->GetParam("ttable-limit")); size_t index = 0; size_t weightAllOffset = 0; for(size_t currDict = 0 ; currDict < translationVector.size(); currDict++) { vector<string> token = Tokenize(translationVector[currDict]); //characteristics of the phrase table vector<FactorType> input = Tokenize<FactorType>(token[0], ",") ,output = Tokenize<FactorType>(token[1], ","); m_maxFactorIdx[0] = CalcMax(m_maxFactorIdx[0], input); m_maxFactorIdx[1] = CalcMax(m_maxFactorIdx[1], output); m_maxNumFactors = std::max(m_maxFactorIdx[0], m_maxFactorIdx[1]) + 1; string filePath= token[3]; size_t numScoreComponent = Scan<size_t>(token[2]); assert(weightAll.size() >= weightAllOffset + numScoreComponent); // weights for this phrase dictionary // first InputScores (if any), then translation scores vector<float> weight; if(currDict==0 && m_inputType) { // TODO. find what the assumptions made by confusion network about phrase table output which makes // it only work with binrary file. This is a hack m_numInputScores=m_parameter->GetParam("weight-i").size(); for(unsigned k=0;k<m_numInputScores;++k) weight.push_back(Scan<float>(m_parameter->GetParam("weight-i")[k])); } else{ m_numInputScores=0; } for (size_t currScore = 0 ; currScore < numScoreComponent; currScore++) weight.push_back(weightAll[weightAllOffset + currScore]); if(weight.size() - m_numInputScores != numScoreComponent) { stringstream strme; strme << "Your phrase table has " << numScoreComponent << " scores, but you specified " << weight.size() << " weights!"; UserMessage::Add(strme.str()); return false; } weightAllOffset += numScoreComponent; numScoreComponent += m_numInputScores; assert(numScoreComponent==weight.size()); std::copy(weight.begin(),weight.end(),std::back_inserter(m_allWeights)); IFVERBOSE(1) PrintUserTime(string("Start loading PhraseTable ") + filePath); if (!FileExists(filePath+".binphr.idx")) { // memory phrase table VERBOSE(2,"using standard phrase tables"); if (m_inputType != SentenceInput) { UserMessage::Add("Must use binary phrase table for this input type"); return false; } PhraseDictionaryMemory *pd=new PhraseDictionaryMemory(numScoreComponent); if (!pd->Load(input , output , filePath , weight , maxTargetPhrase[index] , GetAllLM() , GetWeightWordPenalty())) { delete pd; return false; } m_phraseDictionary.push_back(pd); } else { // binary phrase table VERBOSE(1, "using binary phrase tables for idx "<<currDict<<"\n"); PhraseDictionaryTreeAdaptor *pd=new PhraseDictionaryTreeAdaptor(numScoreComponent,(currDict==0 ? m_numInputScores : 0)); if (!pd->Load(input,output,filePath,weight, maxTargetPhrase[index], GetAllLM(), GetWeightWordPenalty())) { delete pd; return false; } m_phraseDictionary.push_back(pd); } index++; } } IFVERBOSE(1) PrintUserTime("Finished loading phrase tables"); return true;}bool StaticData::LoadMapping(){ // mapping const vector<string> &mappingVector = m_parameter->GetParam("mapping"); DecodeStep *prev = 0; size_t previousVectorList = 0; for(size_t i=0; i<mappingVector.size(); i++) { vector<string> token = Tokenize(mappingVector[i]); size_t vectorList; DecodeType decodeType; size_t index; if (token.size() == 2) { vectorList = 0; decodeType = token[0] == "T" ? Translate : Generate; index = Scan<size_t>(token[1]); } //Smoothing else if (token.size() == 3) { vectorList = Scan<size_t>(token[0]); //the vectorList index can only increment by one assert(vectorList == previousVectorList || vectorList == previousVectorList + 1); if (vectorList > previousVectorList) { prev = NULL; } decodeType = token[1] == "T" ? Translate : Generate; index = Scan<size_t>(token[2]); } else { UserMessage::Add("Malformed mapping!"); return false; } DecodeStep* decodeStep = 0; switch (decodeType) { case Translate: if(index>=m_phraseDictionary.size()) { stringstream strme; strme << "No phrase dictionary with index " << index << " available!"; UserMessage::Add(strme.str()); return false; } decodeStep = new DecodeStepTranslation(m_phraseDictionary[index], prev); break; case Generate: if(index>=m_generationDictionary.size()) { stringstream strme; strme << "No generation dictionary with index " << index << " available!"; UserMessage::Add(strme.str()); return false; } decodeStep = new DecodeStepGeneration(m_generationDictionary[index], prev); break; case InsertNullFertilityWord: assert(!"Please implement NullFertilityInsertion."); break; } assert(decodeStep); if (m_decodeStepVL.size() < vectorList + 1) { m_decodeStepVL.push_back(new DecodeGraph()); } m_decodeStepVL[vectorList]->Add(decodeStep); prev = decodeStep; previousVectorList = vectorList; } return true;}void StaticData::CleanUpAfterSentenceProcessing() const{ for(size_t i=0;i<m_phraseDictionary.size();++i) m_phraseDictionary[i]->CleanUp(); for(size_t i=0;i<m_generationDictionary.size();++i) m_generationDictionary[i]->CleanUp(); //something LMs could do after each sentence LMList::const_iterator iterLM; for (iterLM = m_languageModel.begin() ; iterLM != m_languageModel.end() ; ++iterLM) { LanguageModel &languageModel = **iterLM; languageModel.CleanUpAfterSentenceProcessing(); }}/** initialize the translation and language models for this sentence (includes loading of translation table entries on demand, if binary format is used) */void StaticData::InitializeBeforeSentenceProcessing(InputType const& in) const{ m_input = ∈ for(size_t i=0;i<m_phraseDictionary.size();++i) { m_phraseDictionary[i]->InitializeForInput(in); } for(size_t j=0;j<m_reorderModels.size();++j){ m_reorderModels[j]->InitializeForInput(in); } //something LMs could do before translating a sentence LMList::const_iterator iterLM; for (iterLM = m_languageModel.begin() ; iterLM != m_languageModel.end() ; ++iterLM) { LanguageModel &languageModel = **iterLM; languageModel.InitializeBeforeSentenceProcessing(); } }void StaticData::SetWeightsForScoreProducer(const ScoreProducer* sp, const std::vector<float>& weights){ const size_t id = sp->GetScoreBookkeepingID(); const size_t begin = m_scoreIndexManager.GetBeginIndex(id); const size_t end = m_scoreIndexManager.GetEndIndex(id); assert(end - begin == weights.size()); if (m_allWeights.size() < end) m_allWeights.resize(end); std::vector<float>::const_iterator weightIter = weights.begin(); for (size_t i = begin; i < end; i++) m_allWeights[i] = *weightIter++;}const TranslationOptionList* StaticData::FindTransOptListInCache(const Phrase &sourcePhrase) const{ std::map<Phrase, TranslationOptionList>::const_iterator iter = m_transOptCache.find(sourcePhrase); if (iter == m_transOptCache.end()) return NULL; return &(iter->second);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -