lexicalreorderingtable.cpp.svn-base
#include "LexicalReorderingTable.h"#include "InputFileStream.h"//#include "LVoc.h" //need IPhrase#include "StaticData.h"#include "PhraseDictionary.h"#include "GenerationDictionary.h"#include "TargetPhrase.h"#include "TargetPhraseCollection.h"/* * local helper functions *///cleans str of leading and tailing spacesstd::string auxClearString(const std::string& str){ int i = 0, j = str.size()-1; while(i <= j){ if(' ' != str[i]){ break; } else { ++i; } } while(j >= i){ if(' ' != str[j]){ break; } else { --j; } } return str.substr(i,j-i+1);}void auxAppend(IPhrase& head, const IPhrase& tail){ head.reserve(head.size()+tail.size()); for(size_t i = 0; i < tail.size(); ++i){ head.push_back(tail[i]); }}/* * functions for LexicalReorderingTable */LexicalReorderingTable* LexicalReorderingTable::LoadAvailable(const std::string& filePath, const FactorList& f_factors, const FactorList& e_factors, const FactorList& c_factors){ //decide use Tree or Memory table if(FileExists(filePath+".binlexr.idx")){ //there exists a binary version use that return new LexicalReorderingTableTree(filePath, f_factors, e_factors, c_factors); } else { //use plain memory return new LexicalReorderingTableMemory(filePath, f_factors, e_factors, c_factors); } }/* * functions for LexicalReorderingTableMemory */LexicalReorderingTableMemory::LexicalReorderingTableMemory( const std::string& filePath, const std::vector<FactorType>& f_factors, const std::vector<FactorType>& e_factors, const std::vector<FactorType>& c_factors) : LexicalReorderingTable(f_factors, e_factors, c_factors) { LoadFromFile(filePath);}LexicalReorderingTableMemory::~LexicalReorderingTableMemory(){}std::vector<float> LexicalReorderingTableMemory::GetScore(const Phrase& f, const Phrase& e, const Phrase& c) { //rather complicated because of const can't use []... 
as [] might enter new things into std::map //also can't have to be careful with words range if c is empty can't use c.GetSize()-1 will underflow and be large TableType::const_iterator r; std::string key; if(0 == c.GetSize()){ key = MakeKey(f,e,c); r = m_Table.find(key); if(m_Table.end() != r){ return r->second; } } else { //right try from large to smaller context for(size_t i = 0; i <= c.GetSize(); ++i){ Phrase sub_c(c.GetSubString(WordsRange(i,c.GetSize()-1))); key = MakeKey(f,e,sub_c); r = m_Table.find(key); if(m_Table.end() != r){ return r->second; } } } return Score(); }void LexicalReorderingTableMemory::DbgDump(std::ostream* out) const{ TableType::const_iterator i; for(i = m_Table.begin(); i != m_Table.end(); ++i){ *out << " key: '" << i->first << "' score: "; *out << "(num scores: " << (i->second).size() << ")"; for(size_t j = 0; j < (i->second).size(); ++j){ *out << (i->second)[j] << " "; } *out << "\n"; }};std::string LexicalReorderingTableMemory::MakeKey(const Phrase& f, const Phrase& e, const Phrase& c) const { /* std::string key; if(!m_FactorsF.empty()){ key += f.GetStringRep(m_FactorsF); } if(!m_FactorsE.empty()){ if(!key.empty()){ key += " ||| "; } key += e.GetStringRep(m_FactorsE); } */ return MakeKey(auxClearString(f.GetStringRep(m_FactorsF)), auxClearString(e.GetStringRep(m_FactorsE)), auxClearString(c.GetStringRep(m_FactorsC)));}std::string LexicalReorderingTableMemory::MakeKey(const std::string& f, const std::string& e, const std::string& c) const{ std::string key; if(!f.empty()){ key += f; } if(!m_FactorsE.empty()){ if(!key.empty()){ key += "|||"; } key += e; } if(!m_FactorsC.empty()){ if(!key.empty()){ key += "|||"; } key += c; } return key;}void LexicalReorderingTableMemory::LoadFromFile(const std::string& filePath){ std::string fileName = filePath; if(!FileExists(fileName) && FileExists(fileName+".gz")){ fileName += ".gz"; } InputFileStream file(fileName); std::string line(""), key(""); int numScores = -1; std::cerr << "Loading table into memory..."; while(!getline(file, line).eof()){ std::vector<std::string> tokens = TokenizeMultiCharSeparator(line, "|||"); int t = 0 ; std::string f(""),e(""),c(""); if(!m_FactorsF.empty()){ //there should be something for f f = auxClearString(tokens.at(t)); ++t; } if(!m_FactorsE.empty()){ //there should be something for e e = auxClearString(tokens.at(t)); ++t; } if(!m_FactorsC.empty()){ //there should be something for c c = auxClearString(tokens.at(t)); ++t; } //last token are the probs std::vector<float> p = Scan<float>(Tokenize(tokens.at(t))); //sanity check: all lines must have equall number of probs if(-1 == numScores){ numScores = (int)p.size(); //set in first line } if((int)p.size() != numScores){ TRACE_ERR( "found inconsistent number of probabilities... 
found " << p.size() << " expected " << numScores << std::endl); exit(0); } std::transform(p.begin(),p.end(),p.begin(),TransformScore); //save it all into our map m_Table[MakeKey(f,e,c)] = p; } std::cerr << "done.\n";}/* * functions for LexicalReorderingTableTree */LexicalReorderingTableTree::LexicalReorderingTableTree( const std::string& filePath, const std::vector<FactorType>& f_factors, const std::vector<FactorType>& e_factors, const std::vector<FactorType>& c_factors) : LexicalReorderingTable(f_factors, e_factors, c_factors) { m_Table.Read(filePath+".binlexr"); }LexicalReorderingTableTree::~LexicalReorderingTableTree(){}Score LexicalReorderingTableTree::GetScore(const Phrase& f, const Phrase& e, const Phrase& c) { if( (!m_FactorsF.empty() && 0 == f.GetSize()) || (!m_FactorsE.empty() && 0 == e.GetSize())){ //NOTE: no check for c as c might be empty, e.g. start of sentence //not a proper key // phi: commented out, since e may be empty (drop-unknown) //std::cerr << "Not a proper key!\n"; return Score(); } CacheType::iterator i;; if(m_UseCache){ std::pair<CacheType::iterator, bool> r = m_Cache.insert(std::make_pair(MakeCacheKey(f,e),Candidates())); if(!r.second){ return auxFindScoreForContext((r.first)->second, c); } i = r.first; } else if(!m_Cache.empty()) { //although we might not be caching now, cache might be none empty! i = m_Cache.find(MakeCacheKey(f,e)); if(i != m_Cache.end()){ return auxFindScoreForContext(i->second, c); } } //not in cache go to file... Score score; Candidates cands; m_Table.GetCandidates(MakeTableKey(f,e), &cands); if(cands.empty()){ return Score(); } if(m_FactorsC.empty()){ assert(1 == cands.size()); return cands[0].GetScore(0); } else { score = auxFindScoreForContext(cands, c); } //cache for future use if(m_UseCache){ i->second = cands; } return score;};Score LexicalReorderingTableTree::auxFindScoreForContext(const Candidates& cands, const Phrase& context){ if(m_FactorsC.empty()){ assert(cands.size() <= 1); return (1 == cands.size())?(cands[0].GetScore(0)):(Score()); } else { std::vector<std::string> cvec; for(size_t i = 0; i < context.GetSize(); ++i){ /* old code std::string s = context.GetWord(i).ToString(m_FactorsC); cvec.push_back(s.substr(0,s.size()-1)); */ cvec.push_back(context.GetWord(i).GetString(m_FactorsC, false)); } IPhrase c = m_Table.ConvertPhrase(cvec,TargetVocId); IPhrase sub_c; IPhrase::iterator start = c.begin(); for(size_t j = 0; j <= context.GetSize(); ++j, ++start){ sub_c.assign(start, c.end()); for(size_t cand = 0; cand < cands.size(); ++cand){ IPhrase p = cands[cand].GetPhrase(0); if(cands[cand].GetPhrase(0) == sub_c){ return cands[cand].GetScore(0); } } } return Score(); }}/*void LexicalReorderingTableTree::DbgDump(std::ostream* pout){ std::ostream& out = *pout; //TODO! }*/void LexicalReorderingTableTree::InitializeForInput(const InputType& input){ ClearCache(); if(ConfusionNet const* cn = dynamic_cast<ConfusionNet const*>(&input)){ Cache(*cn); } else if(Sentence const* s = dynamic_cast<Sentence const*>(&input)){ // Cache(*s); ... 
this just takes up too much memory, we cache elsewhere DisableCache(); }}; bool LexicalReorderingTableTree::Create(std::istream& inFile, const std::string& outFileName){ std::string line; //TRACE_ERR("Entering Create...\n"); std::string ofn(outFileName+".binlexr.srctree"), oft(outFileName+".binlexr.tgtdata"), ofi(outFileName+".binlexr.idx"), ofsv(outFileName+".binlexr.voc0"), oftv(outFileName+".binlexr.voc1"); FILE *os = fOpen(ofn.c_str(),"wb"); FILE *ot = fOpen(oft.c_str(),"wb"); //TRACE_ERR("opend files....\n"); typedef PrefixTreeSA<LabelId,OFF_T> PSA; PSA *psa = new PSA; PSA::setDefault(InvalidOffT); WordVoc* voc[3]; LabelId currFirstWord = InvalidLabelId; IPhrase currKey; Candidates cands; std::vector<OFF_T> vo; size_t lnc = 0; size_t numTokens = 0; size_t numKeyTokens = 0; while(getline(inFile, line)){ //TRACE_ERR(lnc<<":"<<line<<"\n"); ++lnc; if(0 == lnc % 10000){
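
A minimal sketch of the plain-text table line that LexicalReorderingTableMemory::LoadFromFile() parses, with hypothetical phrases and scores: the "|||"-separated fields are the source phrase f, the target phrase e, an optional context phrase c (only when context factors are configured), and finally a whitespace-separated list of probabilities; every line must carry the same number of probabilities:

  der Mann ||| the man ||| 0.25 0.45 0.30 0.20 0.50 0.30

Each score vector is run through TransformScore() and stored in m_Table under the key built by MakeKey(f, e, c).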