// languagemodelirst.cpp.svn-base
// NOTE(review): the two lines above this comment in the scraped page were
// code-viewer chrome (filename banner and a font-size label), not source.
// $Id$/***********************************************************************Moses - factored phrase-based language decoderCopyright (C) 2006 University of EdinburghThis library is free software; you can redistribute it and/ormodify it under the terms of the GNU Lesser General PublicLicense as published by the Free Software Foundation; eitherversion 2.1 of the License, or (at your option) any later version.This library is distributed in the hope that it will be useful,but WITHOUT ANY WARRANTY; without even the implied warranty ofMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNULesser General Public License for more details.You should have received a copy of the GNU Lesser General PublicLicense along with this library; if not, write to the Free SoftwareFoundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA***********************************************************************/#include <cassert>#include <limits>#include <iostream>#include <fstream>#include "dictionary.h"#include "n_gram.h"#include "lmtable.h"#include "lmmacro.h"#include "LanguageModelIRST.h"#include "TypeDef.h"#include "Util.h"#include "FactorCollection.h"#include "Phrase.h"#include "InputFileStream.h"#include "StaticData.h"using namespace std;LanguageModelIRST::LanguageModelIRST(bool registerScore, ScoreIndexManager &scoreIndexManager):LanguageModelSingleFactor(registerScore, scoreIndexManager),m_lmtb(0){}LanguageModelIRST::~LanguageModelIRST(){ delete m_lmtb; delete m_lmtb_ng;}bool LanguageModelIRST::Load(const std::string &filePath, FactorType factorType, float weight, size_t nGramOrder){ char *SepString = " \t\n"; cerr << "In LanguageModelIRST::Load: nGramOrder = " << nGramOrder << "\n"; FactorCollection &factorCollection = FactorCollection::Instance(); m_factorType = factorType; m_weight = weight; m_nGramOrder = nGramOrder; // get name of LM file and, if any, of the micro-macro map file char *filenames = strdup(filePath.c_str()); m_filePath = 
strsep(&filenames, SepString); // Open the input file (possibly gzipped) InputFileStream inp(m_filePath); if (filenames) { // case LMfile + MAPfile: create an object of lmmacro class and load both LM file and map cerr << "Loading LM file + MAP\n"; m_mapFilePath = strsep(&filenames, SepString); if (!FileExists(m_mapFilePath)) { cerr << "ERROR: Map file <" << m_mapFilePath << "> does not exist\n"; return false; } InputFileStream inpMap(m_mapFilePath); m_lmtb = new lmmacro(m_filePath, inp, inpMap); } else { // case (standard) LMfile only: create an object of lmtable cerr << "Loading LM file (no MAP)\n"; m_lmtb = (lmtable *)new lmtable; // Load the (possibly binary) model#ifdef WIN32 m_lmtb->load(inp); //don't use memory map#else if (m_filePath.compare(m_filePath.size()-3,3,".mm")==0) m_lmtb->load(inp,m_filePath.c_str(),NULL,1); else m_lmtb->load(inp,m_filePath.c_str(),NULL,0);#endif } m_lmtb_ng=new ngram(m_lmtb->getDict()); // ngram of words/micro tags m_lmtb_size=m_lmtb->maxlevel(); // LM can be ok, just outputs warnings // Mauro: in the original, the following two instructions are wrongly switched: m_unknownId = m_lmtb->getDict()->oovcode(); // at the level of micro tags CreateFactors(factorCollection); VERBOSE(1, "IRST: m_unknownId=" << m_unknownId << std::endl); //install caches m_lmtb->init_probcache(); m_lmtb->init_statecache(); m_lmtb->init_lmtcaches(m_lmtb->maxlevel()>2?m_lmtb->maxlevel()-1:2); return true;}void LanguageModelIRST::CreateFactors(FactorCollection &factorCollection){ // add factors which have srilm id // code copied & paste from SRI LM class. 
should do template function std::map<size_t, int> lmIdMap; size_t maxFactorId = 0; // to create lookup vector later on dict_entry *entry; dictionary_iter iter(m_lmtb->getDict()); // at the level of micro tags while ( (entry = iter.next()) != NULL) { size_t factorId = factorCollection.AddFactor(Output, m_factorType, entry->word)->GetId(); lmIdMap[factorId] = entry->code; maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId; } size_t factorId; m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, BOS_); factorId = m_sentenceStart->GetId(); m_lmtb_sentenceStart=lmIdMap[factorId] = GetLmID(BOS_); maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId; m_sentenceStartArray[m_factorType] = m_sentenceStart; m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, EOS_); factorId = m_sentenceEnd->GetId(); m_lmtb_sentenceEnd=lmIdMap[factorId] = GetLmID(EOS_); maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId; m_sentenceEndArray[m_factorType] = m_sentenceEnd; // add to lookup vector in object m_lmIdLookup.resize(maxFactorId+1); fill(m_lmIdLookup.begin(), m_lmIdLookup.end(), m_unknownId); map<size_t, int>::iterator iterMap; for (iterMap = lmIdMap.begin() ; iterMap != lmIdMap.end() ; ++iterMap) { m_lmIdLookup[iterMap->first] = iterMap->second; } }int LanguageModelIRST::GetLmID( const std::string &str ) const{ return m_lmtb->getDict()->encode( str.c_str() ); // at the level of micro tags}float LanguageModelIRST::GetValue(const vector<const Word*> &contextFactor, State* finalState, unsigned int* len) const{ unsigned int dummy; if (!len) { len = &dummy; } FactorType factorType = GetFactorType(); // set up context size_t count = contextFactor.size(); m_lmtb_ng->size=0; if (count< (size_t)(m_lmtb_size-1)) m_lmtb_ng->pushc(m_lmtb_sentenceEnd); if (count< (size_t)m_lmtb_size) m_lmtb_ng->pushc(m_lmtb_sentenceStart); for (size_t i = 0 ; i < count ; i++) { //int lmId = GetLmID((*contextFactor[i])[factorType]);#ifdef DEBUG cout << 
"i=" << i << " -> " << (*contextFactor[i])[factorType]->GetString() << "\n";#endif int lmId = GetLmID((*contextFactor[i])[factorType]->GetString()); m_lmtb_ng->pushc(lmId); } if (finalState){ *finalState=(State *)m_lmtb->cmaxsuffptr(*m_lmtb_ng); // back off stats not currently available *len = 0; } return TransformIRSTScore((float) m_lmtb->clprob(*m_lmtb_ng));}void LanguageModelIRST::CleanUpAfterSentenceProcessing(){ TRACE_ERR( "reset caches\n"); m_lmtb->reset_caches(); #ifndef WIN32 TRACE_ERR( "reset mmap\n"); m_lmtb->reset_mmap();#endif }void LanguageModelIRST::InitializeBeforeSentenceProcessing(){ //nothing to do#ifdef TRACE_CACHE m_lmtb->sentence_id++;#endif}
// NOTE(review): trailing hotkey-help text from the code-viewer page
// (copy/search/fullscreen/theme/font-size shortcuts) removed — not source.