⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 lexicalreorderingtable.cpp.svn-base

📁 moses开源的机器翻译系统
💻 SVN-BASE
📖 第 1 页 / 共 2 页
字号:
#include "LexicalReorderingTable.h"

#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

#include "InputFileStream.h"
//#include "LVoc.h" //need IPhrase
#include "StaticData.h"
#include "PhraseDictionary.h"
#include "GenerationDictionary.h"
#include "TargetPhrase.h"
#include "TargetPhraseCollection.h"

/*
 * local helper functions
 */

/**
 * Returns a copy of str with leading and trailing space characters removed.
 * Only ' ' is stripped (tabs and other whitespace are kept). A string that is
 * empty or consists solely of spaces yields an empty string.
 */
std::string auxClearString(const std::string& str){
  const std::string::size_type first = str.find_first_not_of(' ');
  if(std::string::npos == first){
    return std::string();
  }
  const std::string::size_type last = str.find_last_not_of(' ');
  return str.substr(first, last - first + 1);
}

/**
 * Appends every element of tail to the end of head, preserving order.
 * head and tail must be distinct objects.
 */
void auxAppend(IPhrase& head, const IPhrase& tail){
  head.insert(head.end(), tail.begin(), tail.end());
}

/*
 * functions for LexicalReorderingTable
 */

/**
 * Creates a reordering table for filePath.
 * If filePath + ".binlexr.idx" exists, a LexicalReorderingTableTree is
 * created, otherwise a LexicalReorderingTableMemory.
 * The returned object is allocated with new.
 */
LexicalReorderingTable* LexicalReorderingTable::LoadAvailable(const std::string& filePath, const FactorList& f_factors, const FactorList& e_factors, const FactorList& c_factors){
  //decide use Tree or Memory table
  if(FileExists(filePath+".binlexr.idx")){
    //there exists a binary version use that
    return new LexicalReorderingTableTree(filePath, f_factors, e_factors, c_factors);
  }
  //use plain memory
  return new LexicalReorderingTableMemory(filePath, f_factors, e_factors, c_factors);
}

/*
 * functions for LexicalReorderingTableMemory
 */

/**
 * Builds the in-memory table by loading all entries from filePath
 * (see LoadFromFile).
 */
LexicalReorderingTableMemory::LexicalReorderingTableMemory(
                const std::string& filePath,
                const std::vector<FactorType>& f_factors,
                const std::vector<FactorType>& e_factors,
                const std::vector<FactorType>& c_factors)
  : LexicalReorderingTable(f_factors, e_factors, c_factors)
{
  LoadFromFile(filePath);
}

LexicalReorderingTableMemory::~LexicalReorderingTableMemory(){
}

/**
 * Looks up the scores stored for (f, e, c).
 * With a non-empty context c, progressively shorter suffixes of c are tried,
 * starting with the full context and ending with the empty one; the first
 * key found wins. Returns an empty Score when nothing matches.
 */
std::vector<float> LexicalReorderingTableMemory::GetScore(const Phrase& f,
                                                          const Phrase& e,
                                                          const Phrase& c) {
  //rather complicated because of const can't use []... as [] might enter new things into std::map
  //also have to be careful with words range: if c is empty, c.GetSize()-1 will underflow and be large
  TableType::const_iterator r;
  std::string key;
  if(0 == c.GetSize()){
    key = MakeKey(f,e,c);
    r = m_Table.find(key);
    if(m_Table.end() != r){
      return r->second;
    }
  } else {
    //try from large to smaller context; the last pass (i == c.GetSize())
    //requests the range (size, size-1)
    for(size_t i = 0; i <= c.GetSize(); ++i){
      Phrase sub_c(c.GetSubString(WordsRange(i,c.GetSize()-1)));
      key = MakeKey(f,e,sub_c);
      r = m_Table.find(key);
      if(m_Table.end() != r){
        return r->second;
      }
    }
  }
  return Score();
}

/**
 * Writes every table entry (key, number of scores, then the scores) to *out,
 * one entry per line.
 */
void LexicalReorderingTableMemory::DbgDump(std::ostream* out) const{
  TableType::const_iterator i;
  for(i = m_Table.begin(); i != m_Table.end(); ++i){
    *out << " key: '" << i->first << "' score: ";
    *out << "(num scores: " << (i->second).size() << ")";
    for(size_t j = 0; j < (i->second).size(); ++j){
      *out << (i->second)[j] << " ";
    }
    *out << "\n";
  }
}

/**
 * Builds the lookup key for the given phrases from their string
 * representations restricted to the table's factor lists; surrounding
 * spaces are stripped before the parts are joined.
 */
std::string LexicalReorderingTableMemory::MakeKey(const Phrase& f,
                                                  const Phrase& e,
                                                  const Phrase& c) const {
  /*
  std::string key;
  if(!m_FactorsF.empty()){
    key += f.GetStringRep(m_FactorsF);
  }
  if(!m_FactorsE.empty()){
    if(!key.empty()){
      key += " ||| ";
    }
    key += e.GetStringRep(m_FactorsE);
  }
  */
  return MakeKey(auxClearString(f.GetStringRep(m_FactorsF)),
                 auxClearString(e.GetStringRep(m_FactorsE)),
                 auxClearString(c.GetStringRep(m_FactorsC)));
}

/**
 * Joins the already cleaned parts into a key using "|||" as separator.
 * f is added when it is non-empty; e and c are added when the corresponding
 * factor list of the table is non-empty. No separator is emitted while the
 * key is still empty.
 */
std::string LexicalReorderingTableMemory::MakeKey(const std::string& f,
                                                  const std::string& e,
                                                  const std::string& c) const{
  std::string key;
  if(!f.empty()){
    key += f;
  }
  if(!m_FactorsE.empty()){
    if(!key.empty()){
      key += "|||";
    }
    key += e;
  }
  if(!m_FactorsC.empty()){
    if(!key.empty()){
      key += "|||";
    }
    key += c;
  }
  return key;
}

/**
 * Reads the text table at filePath (or filePath + ".gz" when only that file
 * exists) into m_Table.
 * Each line is split on "|||": one field per non-empty factor list
 * (f, e, c in that order) followed by the probabilities. Every line must
 * carry the same number of probabilities; otherwise an error is traced and
 * the process exits with a failure status. A line with too few fields makes
 * tokens.at() throw std::out_of_range.
 */
void LexicalReorderingTableMemory::LoadFromFile(const std::string& filePath){
  std::string fileName = filePath;
  if(!FileExists(fileName) && FileExists(fileName+".gz")){
    fileName += ".gz";
  }
  InputFileStream file(fileName);
  std::string line("");
  int numScores = -1;
  std::cerr << "Loading table into memory...";
  //Test the stream itself rather than eof(): this also processes a final line
  //that lacks a trailing newline and terminates when the stream is in a
  //failed state (e.g. the file could not be opened).
  while(std::getline(file, line)){
    std::vector<std::string> tokens = TokenizeMultiCharSeparator(line, "|||");
    int t = 0;
    std::string f(""),e(""),c("");

    if(!m_FactorsF.empty()){
      //there should be something for f
      f = auxClearString(tokens.at(t));
      ++t;
    }
    if(!m_FactorsE.empty()){
      //there should be something for e
      e = auxClearString(tokens.at(t));
      ++t;
    }
    if(!m_FactorsC.empty()){
      //there should be something for c
      c = auxClearString(tokens.at(t));
      ++t;
    }
    //last token are the probs
    std::vector<float> p = Scan<float>(Tokenize(tokens.at(t)));
    //sanity check: all lines must have equal number of probs
    if(-1 == numScores){
      numScores = static_cast<int>(p.size()); //set in first line
    }
    if(static_cast<int>(p.size()) != numScores){
      TRACE_ERR( "found inconsistent number of probabilities... found " << p.size() << " expected " << numScores << std::endl);
      //this is a fatal error, so do not report success to the caller
      exit(EXIT_FAILURE);
    }
    std::transform(p.begin(),p.end(),p.begin(),TransformScore);
    //save it all into our map
    m_Table[MakeKey(f,e,c)] = p;
  }
  std::cerr << "done.\n";
}

/*
 * functions for LexicalReorderingTableTree
 */

/**
 * Opens the binary table stored under filePath + ".binlexr".
 */
LexicalReorderingTableTree::LexicalReorderingTableTree(
                const std::string& filePath,
                const std::vector<FactorType>& f_factors,
                const std::vector<FactorType>& e_factors,
                const std::vector<FactorType>& c_factors)
  : LexicalReorderingTable(f_factors, e_factors, c_factors)
{
  m_Table.Read(filePath+".binlexr");
}

LexicalReorderingTableTree::~LexicalReorderingTableTree(){
}

/**
 * Returns the scores for (f, e) under context c, or an empty Score when the
 * key is not proper or nothing is found.
 * Candidates for (f, e) are taken from the cache when present, otherwise
 * fetched from the table. With caching enabled the fetched candidates are
 * stored for later calls; a key that has no candidates keeps an empty cache
 * entry, so later lookups of it are answered from the cache.
 */
Score LexicalReorderingTableTree::GetScore(const Phrase& f, const Phrase& e, const Phrase& c) {
  if(   (!m_FactorsF.empty() && 0 == f.GetSize())
     || (!m_FactorsE.empty() && 0 == e.GetSize())){
    //NOTE: no check for c as c might be empty, e.g. start of sentence
    //not a proper key
    // phi: commented out, since e may be empty (drop-unknown)
    //std::cerr << "Not a proper key!\n";
    return Score();
  }
  CacheType::iterator i;
  if(m_UseCache){
    std::pair<CacheType::iterator, bool> r = m_Cache.insert(std::make_pair(MakeCacheKey(f,e),Candidates()));
    if(!r.second){
      return auxFindScoreForContext((r.first)->second, c);
    }
    i = r.first;
  } else if(!m_Cache.empty()) {
    //although we might not be caching now, cache might be none empty!
    i = m_Cache.find(MakeCacheKey(f,e));
    if(i != m_Cache.end()){
      return auxFindScoreForContext(i->second, c);
    }
  }
  //not in cache go to file...
  Candidates cands;
  m_Table.GetCandidates(MakeTableKey(f,e), &cands);
  if(cands.empty()){
    return Score();
  }
  //cache for future use; this has to happen before any of the returns below,
  //otherwise the empty placeholder inserted above would stay in the cache and
  //every later lookup of this key would yield an empty Score
  if(m_UseCache){
    i->second = cands;
  }
  if(m_FactorsC.empty()){
    assert(1 == cands.size());
    return cands[0].GetScore(0);
  }
  return auxFindScoreForContext(cands, c);
}

/**
 * Picks the score of the candidate matching the given context.
 * Without context factors there is at most one candidate and its score (or
 * an empty Score) is returned. Otherwise the context words are converted via
 * the table's target vocabulary and successively shorter suffixes (down to
 * the empty one) are compared with each candidate's phrase; the first match
 * wins. Returns an empty Score when no candidate matches.
 */
Score LexicalReorderingTableTree::auxFindScoreForContext(const Candidates& cands, const Phrase& context){
  if(m_FactorsC.empty()){
    assert(cands.size() <= 1);
    return (1 == cands.size())?(cands[0].GetScore(0)):(Score());
  }
  std::vector<std::string> cvec;
  cvec.reserve(context.GetSize());
  for(size_t i = 0; i < context.GetSize(); ++i){
    /* old code
    std::string s = context.GetWord(i).ToString(m_FactorsC);
    cvec.push_back(s.substr(0,s.size()-1));
    */
    cvec.push_back(context.GetWord(i).GetString(m_FactorsC, false));
  }
  IPhrase c = m_Table.ConvertPhrase(cvec,TargetVocId);
  IPhrase sub_c;
  //Walk over all suffixes of c, the empty suffix included. The loop is bounded
  //by c itself so that the iterator is never advanced past c.end().
  //NOTE(review): presumably c holds one id per context word, which makes this
  //the same sequence of suffixes as counting up to context.GetSize() - confirm
  //against ConvertPhrase.
  for(IPhrase::iterator start = c.begin(); ; ++start){
    sub_c.assign(start, c.end());
    for(size_t cand = 0; cand < cands.size(); ++cand){
      if(cands[cand].GetPhrase(0) == sub_c){
        return cands[cand].GetScore(0);
      }
    }
    if(c.end() == start){
      break;
    }
  }
  return Score();
}

/*
void LexicalReorderingTableTree::DbgDump(std::ostream* pout){
  std::ostream& out = *pout;
  //TODO!
}
*/

/**
 * Prepares the cache for a new input: the cache is cleared, then filled via
 * Cache() for a ConfusionNet input, whereas DisableCache() is called for a
 * Sentence input. Other input types only get the cache cleared.
 */
void LexicalReorderingTableTree::InitializeForInput(const InputType& input){
  ClearCache();
  if(ConfusionNet const* cn = dynamic_cast<ConfusionNet const*>(&input)){
    Cache(*cn);
  } else if(dynamic_cast<Sentence const*>(&input)){
    // Cache(*s); ... this just takes up too much memory, we cache elsewhere
    DisableCache();
  }
}

// NOTE(review): only the beginning of Create() is visible in this excerpt; the
// visible statements are reproduced unchanged and the function continues
// beyond this point.
bool LexicalReorderingTableTree::Create(std::istream& inFile,
                                        const std::string& outFileName){
  std::string line;
  //TRACE_ERR("Entering Create...\n");
  std::string
    ofn(outFileName+".binlexr.srctree"),
    oft(outFileName+".binlexr.tgtdata"),
    ofi(outFileName+".binlexr.idx"),
    ofsv(outFileName+".binlexr.voc0"),
    oftv(outFileName+".binlexr.voc1");

  FILE *os = fOpen(ofn.c_str(),"wb");
  FILE *ot = fOpen(oft.c_str(),"wb");

  //TRACE_ERR("opend files....\n");

  typedef PrefixTreeSA<LabelId,OFF_T> PSA;
  PSA *psa = new PSA;
  PSA::setDefault(InvalidOffT);
  WordVoc* voc[3];

  LabelId currFirstWord = InvalidLabelId;
  IPhrase currKey;

  Candidates         cands;
  std::vector<OFF_T> vo;
  size_t lnc = 0;
  size_t numTokens    = 0;
  size_t numKeyTokens = 0;
  while(getline(inFile, line)){
    //TRACE_ERR(lnc<<":"<<line<<"\n");
    ++lnc;
    if(0 == lnc % 10000){

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -