📄 staticdata.cpp.svn-base

📁 moses开源的机器翻译系统
💻 SVN-BASE
📖 第 1 页 / 共 2 页
字号:
上一页 12
		//decode num weights (and fetch weight from array...)		std::vector<float> mweights;		numWeights = atoi(spec[2].c_str());		for(size_t k = 0; k < numWeights; ++k, ++w)		{			if(w >= weights.size()){				//error not enough weights...				std::cerr << "Lexicalized distortion model: Not enough weights, add to [weight-d]\n";				return false;			} else {				mweights.push_back(weights[w]);			}		}    		//decode filename		string filePath = spec[3];		//all ready load it		//std::cerr << type;		if("monotonicity" == type){			m_reorderModels.push_back(new LexicalMonotonicReordering(filePath, mweights, direction, condition, input, output));		} 		else if("orientation" == type || "msd" == type)		{			m_reorderModels.push_back(new LexicalOrientationReordering(filePath, mweights, direction, condition, input, output));		} 		else if("directional" == type)		{			m_reorderModels.push_back(new LexicalDirectionalReordering(filePath, mweights, direction, condition, input, output));		} 		else 		{			//error unknown type!			std::cerr << " ...unknown type!\n";			return false;		}		//std::cerr << "\n";	}   return true;}bool StaticData::LoadLanguageModels(){	if (m_parameter->GetParam("lmodel-file").size() > 0)	{		// weights		vector<float> weightAll = Scan<float>(m_parameter->GetParam("weight-l"));				for (size_t i = 0 ; i < weightAll.size() ; i++)		{			m_allWeights.push_back(weightAll[i]);		}	  // initialize n-gram order for each factor. populated only by factored lm		const vector<string> &lmVector = m_parameter->GetParam("lmodel-file");		for(size_t i=0; i<lmVector.size(); i++) 		{			vector<string>	token		= Tokenize(lmVector[i]);			if (token.size() != 4 && token.size() != 5 )			{				UserMessage::Add("Expected format 'LM-TYPE FACTOR-TYPE NGRAM-ORDER filePath [mapFilePath (only for IRSTLM)]'");				return false;			}			// type = implementation, SRI, IRST etc			LMImplementation lmImplementation = static_cast<LMImplementation>(Scan<int>(token[0]));						// factorType = 0 = Surface, 1 = POS, 2 = Stem, 3 = Morphology, etc			vector<FactorType> 	factorTypes		= Tokenize<FactorType>(token[1], ",");						// nGramOrder = 2 = bigram, 3 = trigram, etc			size_t nGramOrder = Scan<int>(token[2]);						string &languageModelFile = token[3];			if (token.size() == 5)			  if (lmImplementation==IRST)			    languageModelFile += " " + token[4];			  else {			    UserMessage::Add("Expected format 'LM-TYPE FACTOR-TYPE NGRAM-ORDER filePath [mapFilePath (only for IRSTLM)]'");			    return false;			  }			IFVERBOSE(1)				PrintUserTime(string("Start loading LanguageModel ") + languageModelFile);						LanguageModel *lm = LanguageModelFactory::CreateLanguageModel(																									lmImplementation																									, factorTypes                                        								, nGramOrder																									, languageModelFile																									, weightAll[i]																									, m_scoreIndexManager);      if (lm == NULL)       {      	UserMessage::Add("no LM created. We probably don't have it compiled");      	return false;      }			m_languageModel.push_back(lm);		}	}  // flag indicating that language models were loaded,  // since phrase table loading requires their presence  m_fLMsLoaded = true;	IFVERBOSE(1)		PrintUserTime("Finished loading LanguageModels");  return true;}bool StaticData::LoadGenerationTables(){	if (m_parameter->GetParam("generation-file").size() > 0) 	{		const vector<string> &generationVector = m_parameter->GetParam("generation-file");		const vector<float> &weight = Scan<float>(m_parameter->GetParam("weight-generation"));		IFVERBOSE(1)		{			TRACE_ERR( "weight-generation: ");			for (size_t i = 0 ; i < weight.size() ; i++)			{					TRACE_ERR( weight[i] << "\t");			}			TRACE_ERR(endl);		}		size_t currWeightNum = 0;				for(size_t currDict = 0 ; currDict < generationVector.size(); currDict++) 		{			vector<string>			token		= Tokenize(generationVector[currDict]);			vector<FactorType> 	input		= Tokenize<FactorType>(token[0], ",")													,output	= Tokenize<FactorType>(token[1], ",");      m_maxFactorIdx[1] = CalcMax(m_maxFactorIdx[1], input, output);			string							filePath;			size_t							numFeatures;			numFeatures = Scan<size_t>(token[2]);			filePath = token[3];			if (!FileExists(filePath) && FileExists(filePath + ".gz")) {				filePath += ".gz";			}			VERBOSE(1, filePath << endl);			m_generationDictionary.push_back(new GenerationDictionary(numFeatures, m_scoreIndexManager));			assert(m_generationDictionary.back() && "could not create GenerationDictionary");			if (!m_generationDictionary.back()->Load(input																		, output																		, filePath																		, Output))			{				delete m_generationDictionary.back();				return false;			}			for(size_t i = 0; i < numFeatures; i++) {				assert(currWeightNum < weight.size());				m_allWeights.push_back(weight[currWeightNum++]);			}		}		if (currWeightNum != weight.size()) {			TRACE_ERR( "  [WARNING] config file has " << weight.size() << " generation weights listed, but the configuration for generation files indicates there should be " << currWeightNum << "!\n");		}	}		return true;}bool StaticData::LoadPhraseTables(){	VERBOSE(2,"About to LoadPhraseTables" << endl);	// language models must be loaded prior to loading phrase tables	assert(m_fLMsLoaded);	// load phrase translation tables  if (m_parameter->GetParam("ttable-file").size() > 0)	{		// weights		vector<float> weightAll									= Scan<float>(m_parameter->GetParam("weight-t"));				const vector<string> &translationVector = m_parameter->GetParam("ttable-file");		vector<size_t>	maxTargetPhrase					= Scan<size_t>(m_parameter->GetParam("ttable-limit"));				size_t index = 0;		size_t weightAllOffset = 0;		for(size_t currDict = 0 ; currDict < translationVector.size(); currDict++) 		{			vector<string>                  token           = Tokenize(translationVector[currDict]);			//characteristics of the phrase table			vector<FactorType>      input           = Tokenize<FactorType>(token[0], ",")				,output = Tokenize<FactorType>(token[1], ",");			m_maxFactorIdx[0] = CalcMax(m_maxFactorIdx[0], input);			m_maxFactorIdx[1] = CalcMax(m_maxFactorIdx[1], output);      m_maxNumFactors = std::max(m_maxFactorIdx[0], m_maxFactorIdx[1]) + 1;			string filePath= token[3];			size_t numScoreComponent = Scan<size_t>(token[2]);			assert(weightAll.size() >= weightAllOffset + numScoreComponent);			// weights for this phrase dictionary			// first InputScores (if any), then translation scores			vector<float> weight;			if(currDict==0 && m_inputType)			{	// TODO. find what the assumptions made by confusion network about phrase table output which makes				// it only work with binrary file. This is a hack 					m_numInputScores=m_parameter->GetParam("weight-i").size();				for(unsigned k=0;k<m_numInputScores;++k)					weight.push_back(Scan<float>(m_parameter->GetParam("weight-i")[k]));			}			else{				m_numInputScores=0;			}						for (size_t currScore = 0 ; currScore < numScoreComponent; currScore++)				weight.push_back(weightAll[weightAllOffset + currScore]);									if(weight.size() - m_numInputScores != numScoreComponent) 			{				stringstream strme;				strme << "Your phrase table has " << numScoreComponent							<< " scores, but you specified " << weight.size() << " weights!";				UserMessage::Add(strme.str());				return false;			}									weightAllOffset += numScoreComponent;			numScoreComponent += m_numInputScores;									assert(numScoreComponent==weight.size());			std::copy(weight.begin(),weight.end(),std::back_inserter(m_allWeights));						IFVERBOSE(1)				PrintUserTime(string("Start loading PhraseTable ") + filePath);			if (!FileExists(filePath+".binphr.idx"))			{	// memory phrase table				VERBOSE(2,"using standard phrase tables");				if (m_inputType != SentenceInput)				{					UserMessage::Add("Must use binary phrase table for this input type");					return false;				}								PhraseDictionaryMemory *pd=new PhraseDictionaryMemory(numScoreComponent);				if (!pd->Load(input								 , output								 , filePath								 , weight								 , maxTargetPhrase[index]								 , GetAllLM()								 , GetWeightWordPenalty()))				{					delete pd;					return false;				}				m_phraseDictionary.push_back(pd);			}			else 			{ // binary phrase table				VERBOSE(1, "using binary phrase tables for idx "<<currDict<<"\n");				PhraseDictionaryTreeAdaptor *pd=new PhraseDictionaryTreeAdaptor(numScoreComponent,(currDict==0 ? m_numInputScores : 0));				if (!pd->Load(input,output,filePath,weight,									 maxTargetPhrase[index],									 GetAllLM(),									 GetWeightWordPenalty()))				{					delete pd;					return false;				}				m_phraseDictionary.push_back(pd);			}			index++;		}	}		IFVERBOSE(1)		PrintUserTime("Finished loading phrase tables");	return true;}bool StaticData::LoadMapping(){	// mapping	const vector<string> &mappingVector = m_parameter->GetParam("mapping");	DecodeStep *prev = 0;	size_t previousVectorList = 0;	for(size_t i=0; i<mappingVector.size(); i++) 	{		vector<string>	token		= Tokenize(mappingVector[i]);		size_t vectorList;		DecodeType decodeType;		size_t index;		if (token.size() == 2) 		{		  vectorList = 0;			decodeType = token[0] == "T" ? Translate : Generate;			index = Scan<size_t>(token[1]);		}		//Smoothing		else if (token.size() == 3) 		{		  vectorList = Scan<size_t>(token[0]);			//the vectorList index can only increment by one 			assert(vectorList == previousVectorList || vectorList == previousVectorList + 1);      if (vectorList > previousVectorList)       {        prev = NULL;      }			decodeType = token[1] == "T" ? Translate : Generate;			index = Scan<size_t>(token[2]);		}		 		else 		{			UserMessage::Add("Malformed mapping!");			return false;		}				DecodeStep* decodeStep = 0;		switch (decodeType) {			case Translate:				if(index>=m_phraseDictionary.size())					{						stringstream strme;						strme << "No phrase dictionary with index "									<< index << " available!";						UserMessage::Add(strme.str());						return false;					}				decodeStep = new DecodeStepTranslation(m_phraseDictionary[index], prev);			break;			case Generate:				if(index>=m_generationDictionary.size())					{						stringstream strme;						strme << "No generation dictionary with index "									<< index << " available!";						UserMessage::Add(strme.str());						return false;					}				decodeStep = new DecodeStepGeneration(m_generationDictionary[index], prev);			break;			case InsertNullFertilityWord:				assert(!"Please implement NullFertilityInsertion.");			break;		}		assert(decodeStep);		if (m_decodeStepVL.size() < vectorList + 1) 		{			m_decodeStepVL.push_back(new DecodeGraph());		}		m_decodeStepVL[vectorList]->Add(decodeStep);		prev = decodeStep;		previousVectorList = vectorList;	}		return true;}void StaticData::CleanUpAfterSentenceProcessing() const{	for(size_t i=0;i<m_phraseDictionary.size();++i)		m_phraseDictionary[i]->CleanUp();	for(size_t i=0;i<m_generationDictionary.size();++i)		m_generationDictionary[i]->CleanUp();    //something LMs could do after each sentence   LMList::const_iterator iterLM;	for (iterLM = m_languageModel.begin() ; iterLM != m_languageModel.end() ; ++iterLM)	{		LanguageModel &languageModel = **iterLM;    languageModel.CleanUpAfterSentenceProcessing();	}}/** initialize the translation and language models for this sentence     (includes loading of translation table entries on demand, if    binary format is used) */void StaticData::InitializeBeforeSentenceProcessing(InputType const& in) const{  m_input = &in;  for(size_t i=0;i<m_phraseDictionary.size();++i) {	m_phraseDictionary[i]->InitializeForInput(in);  }  for(size_t j=0;j<m_reorderModels.size();++j){	m_reorderModels[j]->InitializeForInput(in);  }  //something LMs could do before translating a sentence  LMList::const_iterator iterLM;	for (iterLM = m_languageModel.begin() ; iterLM != m_languageModel.end() ; ++iterLM)	{		LanguageModel &languageModel = **iterLM;    languageModel.InitializeBeforeSentenceProcessing();	}  }void StaticData::SetWeightsForScoreProducer(const ScoreProducer* sp, const std::vector<float>& weights){  const size_t id = sp->GetScoreBookkeepingID();  const size_t begin = m_scoreIndexManager.GetBeginIndex(id);  const size_t end = m_scoreIndexManager.GetEndIndex(id);  assert(end - begin == weights.size());  if (m_allWeights.size() < end)    m_allWeights.resize(end);  std::vector<float>::const_iterator weightIter = weights.begin();  for (size_t i = begin; i < end; i++)    m_allWeights[i] = *weightIter++;}const TranslationOptionList* StaticData::FindTransOptListInCache(const Phrase &sourcePhrase) const{	std::map<Phrase, TranslationOptionList>::const_iterator iter			= m_transOptCache.find(sourcePhrase);	if (iter == m_transOptCache.end())		return NULL;	return &(iter->second);}
上一页 12
💿 文件大小 8836 K
👤 上传用户 myhpgnl
📂 所属分类 Linux/Unix编程
🏷️ 相关标签

#moses #开源 #机器翻译系统
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -