📄 lexicalreorderingtable.cpp.svn-base
字号:
// === NOTE(review) ==========================================================
// This chunk starts INSIDE the binary-table creation routine; the function
// signature, the per-line read loop header, and the declarations of line,
// lnc, numTokens, numKeyTokens, voc[], psa, currKey, currFirstWord, vo,
// cands, os, ot, ofi, ofsv and oftv all live above the visible region, so
// comments about them are inferred from usage -- TODO confirm against the
// full file.  The code parses one text line of the reordering table
// ("f ||| [e [||| c]] ||| scores"), inserts the numeric key into an on-disk
// prefix tree, and finally flushes everything and writes the vocabularies.
// ===========================================================================
  TRACE_ERR("."); }  // progress dot; the '}' closes a periodic-report 'if' opened above this chunk

  // --- parse the current line into "|||"-separated fields --------------------
  IPhrase key;   // key phrase(s) mapped to vocabulary ids
  Score score;   // raw scores parsed from the last field
  std::vector<std::string> tokens = TokenizeMultiCharSeparator(line, "|||");
  std::string w;
  if(1 == lnc){
    //do some init stuff in the first line
    numTokens = tokens.size();
    if(tokens.size() == 2){ //f ||| score
      numKeyTokens = 1;
      voc[0] = new WordVoc();
      voc[1] = 0;
    } else if(3 == tokens.size() || 4 == tokens.size()){ //either f ||| e ||| score or f ||| e ||| c ||| score
      numKeyTokens = 2;
      voc[0] = new WordVoc(); //f voc
      voc[1] = new WordVoc(); //e voc
      voc[2] = voc[1];        //c & e share voc
    }
  } else {
    //sanity check ALL lines must have same number of tokens
    assert(numTokens == tokens.size());
  }

  // Build the numeric key: f words, then (if conditioned on e) a MagicWord
  // separator followed by e words.
  int phrase = 0;
  for(; phrase < numKeyTokens; ++phrase){
    //conditioned on more than just f... need |||
    if(phrase >=1){
      key.push_back(PrefixTreeMap::MagicWord);
    }
    std::istringstream is(tokens[phrase]);
    while(is >> w) {
      key.push_back(voc[phrase]->add(w));
    }
  }

  //collect all non key phrases, i.e. c
  // NOTE(review): 'phrase' keeps advancing here so that c is converted through
  // voc[2] (which shares storage with the e vocabulary, see init above).
  std::vector<IPhrase> tgt_phrases;
  tgt_phrases.resize(numTokens - numKeyTokens - 1);
  for(int j = 0; j < tgt_phrases.size(); ++j, ++phrase){  // NOTE(review): signed/unsigned comparison
    std::istringstream is(tokens[numKeyTokens + j]);
    while(is >> w) {
      tgt_phrases[j].push_back(voc[phrase]->add(w));
    }
  }

  //last token is score
  std::istringstream is(tokens[numTokens-1]);
  while(is >> w) {
    score.push_back(atof(w.c_str()));
  }
  //transform score now...
  std::transform(score.begin(),score.end(),score.begin(),TransformScore);
  std::vector<Score> scores;
  scores.push_back(score);

  if(key.empty()) {
    TRACE_ERR("WARNING: empty source phrase in line '"<<line<<"'\n");
    continue;  // skip malformed line; we are inside the per-line read loop
  }

  //first time inits
  if(currFirstWord == InvalidLabelId){ currFirstWord = key[0]; }
  if(currKey.empty()){
    currKey = key;
    //insert key into tree
    assert(psa);
    PSA::Data& d = psa->insert(key);
    if(d == InvalidOffT) {
      d = fTell(ot);  // record where this key's candidate block starts in the data file
    } else {
      TRACE_ERR("ERROR: source phrase already inserted (A)!\nline(" << lnc << "): '" << line << "\n");
      return false;
    }
  }
  if(currKey != key){
    //ok new key
    currKey = key;
    //a) write cands for old key
    cands.writeBin(ot);
    cands.clear();
    //b) check if we need to move on to new tree root
    if(key[0] != currFirstWord){
      // write key prefix tree to file and clear
      PTF pf;
      if(currFirstWord >= vo.size()){
        vo.resize(currFirstWord+1,InvalidOffT);
      }
      vo[currFirstWord] = fTell(os);
      pf.create(*psa, os);
      // clear
      delete psa; psa = new PSA;
      currFirstWord = key[0];
    }
    //c) insert key into tree
    assert(psa);
    PSA::Data& d = psa->insert(key);
    if(d == InvalidOffT) {
      d = fTell(ot);
    } else {
      // NOTE(review): message says "(A)" exactly like the branch above;
      // presumably this one was meant to read "(B)" -- left unchanged here.
      TRACE_ERR("ERROR: source phrase already inserted (A)!\nline(" << lnc << "): '" << line << "\n");
      return false;
    }
  }
  cands.push_back(GenericCandidate(tgt_phrases, scores));
}  // end of per-line read loop (opened above this chunk)

//flush remainders
cands.writeBin(ot);
cands.clear();

//process last currFirstWord
PTF pf;
if(currFirstWord >= vo.size()) {
  vo.resize(currFirstWord+1,InvalidOffT);
}
vo[currFirstWord] = fTell(os);
pf.create(*psa,os);
delete psa; psa=0;

fClose(os);
fClose(ot);
/*
std::vector<size_t> inv;
for(size_t i = 0; i < vo.size(); ++i){
  if(vo[i] == InvalidOffT){
    inv.push_back(i);
  }
}
if(inv.size()) {
  TRACE_ERR("WARNING: there are src voc entries with no phrase "
            "translation: count "<<inv.size()<<"\n"
            "There exists phrase translations for "<<vo.size()-inv.size()
            <<" entries\n");
}
*/

// Persist the per-first-word subtree offsets, then the vocabularies.
FILE *oi = fOpen(ofi.c_str(),"wb");
fWriteVector(oi,vo);
fClose(oi);
if(voc[0]){
  voc[0]->Write(ofsv);
  delete voc[0];
}
if(voc[1]){
  voc[1]->Write(oftv);
  delete voc[1];
}
return true;
}

/**
 * Build the string key under which candidates for (f,e) are stored in
 * m_Cache: the f string representation, then "|||" and the e string
 * representation; either part is omitted when the corresponding factor
 * set is empty (i.e. the model is not conditioned on that side).
 */
std::string
LexicalReorderingTableTree::MakeCacheKey(const Phrase& f,
                                         const Phrase& e) const {
  std::string key;
  if(!m_FactorsF.empty()){
    key += auxClearString(f.GetStringRep(m_FactorsF));
  }
  if(!m_FactorsE.empty()){
    if(!key.empty()){
      key += "|||";
    }
    key += auxClearString(e.GetStringRep(m_FactorsE));
  }
  return key;
};  // NOTE(review): stray ';' after function body -- harmless but unconventional

/**
 * Build the numeric (vocabulary-id) key used to query m_Table for (f,e).
 * f words are converted through the source vocabulary, e words through the
 * target vocabulary, with PrefixTreeMap::MagicWord separating the two parts
 * (mirrors the key layout written by the table-creation code above).
 */
IPhrase
LexicalReorderingTableTree::MakeTableKey(const Phrase& f,
                                         const Phrase& e) const {
  IPhrase key;
  std::vector<std::string> keyPart;
  if(!m_FactorsF.empty()){
    for(int i = 0; i < f.GetSize(); ++i){  // NOTE(review): signed/unsigned comparison
      /* old code
      std::string s = f.GetWord(i).ToString(m_FactorsF);
      keyPart.push_back(s.substr(0,s.size()-1));
      */
      keyPart.push_back(f.GetWord(i).GetString(m_FactorsF, false));
    }
    auxAppend(key, m_Table.ConvertPhrase(keyPart, SourceVocId));
    keyPart.clear();
  }
  if(!m_FactorsE.empty()){
    if(!key.empty()){
      key.push_back(PrefixTreeMap::MagicWord);
    }
    for(int i = 0; i < e.GetSize(); ++i){
      /* old code
      std::string s = e.GetWord(i).ToString(m_FactorsE);
      keyPart.push_back(s.substr(0,s.size()-1));
      */
      keyPart.push_back(e.GetWord(i).GetString(m_FactorsE, false));
    }
    auxAppend(key, m_Table.ConvertPhrase(keyPart,TargetVocId));
    //keyPart.clear();
  }
  return key;
};

// Depth-first traversal frame used by auxCacheForSrcPhrase: a position in the
// prefix tree plus the target-word path (as text) that led there.
struct State {
  State(PPimp* t, const std::string& p) : pos(t), path(p){ }
  PPimp* pos;          // current position/child index in the prefix tree
  std::string path;    // space-prefixed target words accumulated so far
};

/**
 * Pre-fetch every table entry whose source side is exactly f into m_Cache.
 * If the model is conditioned only on f, a single lookup suffices; otherwise
 * the whole e-subtree under f is walked depth-first and each non-empty
 * candidate list is cached under "f|||<e words>".
 */
void LexicalReorderingTableTree::auxCacheForSrcPhrase(const Phrase& f){
  if(m_FactorsE.empty()){
    //f is all of key...
    Candidates cands;
    m_Table.GetCandidates(MakeTableKey(f,Phrase(Output)),&cands);
    m_Cache[MakeCacheKey(f,Phrase(Output))] = cands;
  } else {
    ObjectPool<PPimp> pool;  // owns all traversal positions created below
    PPimp* pPos = m_Table.GetRoot();
    //1) goto subtree for f
    for(int i = 0; i < f.GetSize() && 0 != pPos && pPos->isValid(); ++i){
      /* old code
      pPos = m_Table.Extend(pPos, auxClearString(f.GetWord(i).ToString(m_FactorsF)), SourceVocId);
      */
      pPos = m_Table.Extend(pPos, f.GetWord(i).GetString(m_FactorsF, false), SourceVocId);
    }
    // step over the f/e separator before descending into the e-subtree
    if(0 != pPos && pPos->isValid()){
      pPos = m_Table.Extend(pPos, PrefixTreeMap::MagicWord);
    }
    if(0 == pPos || !pPos->isValid()){
      return;  // f (or f+separator) not in the table -- nothing to cache
    }
    //2) explore whole subtree depth first & cache
    std::string cache_key = auxClearString(f.GetStringRep(m_FactorsF)) + "|||";
    std::vector<State> stack;  // explicit DFS stack instead of recursion
    stack.push_back(State(pool.get(PPimp(pPos->ptr()->getPtr(pPos->idx),0,0)),""));
    Candidates cands;
    while(!stack.empty()){
      if(stack.back().pos->isValid()){
        LabelId w = stack.back().pos->ptr()->getKey(stack.back().pos->idx);
        std::string next_path = stack.back().path + " " + m_Table.ConvertWord(w,TargetVocId);
        //cache this
        m_Table.GetCandidates(*stack.back().pos,&cands);
        if(!cands.empty()){
          m_Cache[cache_key + auxClearString(next_path)] = cands;
        }
        cands.clear();
        // descend into the child, then advance this level's index so the
        // sibling is visited after the child subtree is exhausted
        PPimp* next_pos = pool.get(PPimp(stack.back().pos->ptr()->getPtr(stack.back().pos->idx),0,0));
        ++stack.back().pos->idx;
        stack.push_back(State(next_pos,next_path));
      } else {
        stack.pop_back();  // level exhausted -- backtrack
      }
    }
  }
}

// Confusion-network input: pre-fetching is intentionally not implemented.
void LexicalReorderingTableTree::Cache(const ConfusionNet& input){
  return;
}

/**
 * Pre-fetch reordering candidates for every contiguous sub-span of the input
 * sentence into m_Cache, reporting how many new keys were added.
 */
void LexicalReorderingTableTree::Cache(const Sentence& input){
  //only works with sentences...
  int prev_cache_size = m_Cache.size();
  int max_phrase_length = input.GetSize();
  for(size_t len = 0; len <= max_phrase_length; ++len){  // NOTE(review): signed/unsigned comparison
    for(size_t start = 0; start+len <= input.GetSize(); ++start){
      Phrase f = input.GetSubString(WordsRange(start, start+len));
      auxCacheForSrcPhrase(f);
    }
  }
  std::cerr << "Cached " << m_Cache.size() - prev_cache_size << " new primary reordering table keys\n";
}

/*Pre fetching implementation using Phrase and Generation Dictionaries */
/*
void LexicalReorderingTableTree::Cache(const ConfusionNet& input){
  typedef TargetPhraseCollection::iterator Iter;
  typedef TargetPhraseCollection::const_iterator ConstIter;
  //not implemented for confusion networks...
  Sentence const* s = dynamic_cast<Sentence const*>(&input);
  if(!s){
    return;
  }
  int max_phrase_length = input.GetSize();
  std::vector<PhraseDictionaryBase*> PhraseTables = StaticData::Instance()->GetPhraseDictionaries();
  //new code:
  //std::vector<PhraseDictionary*> PhraseTables = StaticData::Instance()->GetPhraseDictionaries();
  std::vector<GenerationDictionary*> GenTables = StaticData::Instance()->GetGenerationDictionaries();
  for(size_t len = 1; len <= max_phrase_length; ++len){
    for(size_t start = 0; start+len <= input.GetSize(); ++start){
      Phrase f = s->GetSubString(WordsRange(start, start+len));
      //find all translations of f
      TargetPhraseCollection list;
      for(size_t t = 0; t < PhraseTables.size(); ++t){
        //if(doIntersect(PhraseTables[t]->GetOutputFactorMask(),FactorMask(m_FactorsE))){
        //this table gives us something we need
        const TargetPhraseCollection* new_list = PhraseTables[t]->GetTargetPhraseCollection(f);
        TargetPhraseCollection curr_list;
        for(ConstIter i = new_list->begin(); i != new_list->end(); ++i){
          for(Iter j = list.begin(); j != list.end(); ++j){
            curr_list.Add((*j)->MergeNext(*(*i)));
          }
        }
        if(list.IsEmpty()){
          list = *new_list;
        } else {
          list = curr_list;
        }
        //}
      }
      for(size_t g = 0; g < GenTables.size(); ++g){
        //if(doIntersect(GenTables[g]->GetOutputFactorMask(),FactorMask(m_FactorsE))){
        TargetPhraseCollection curr_list;
        for(Iter j = list.begin(); j != list.end(); ++j){
          for(size_t w = 0; w < (*j)->GetSize(); ++w){
            const OutputWordCollection* words = GenTables[g]->FindWord((*j)->GetWord(w));
            for(OutputWordCollection::const_iterator i = words->begin(); i != words->end(); ++i){
              TargetPhrase* p = new TargetPhrase(*(*j));
              Word& pw = p->GetWord(w);
              pw.Merge(i->first);
              curr_list.Add(p);
            }
          }
        }
        list = curr_list;
        //}
      }
      //cache for each translation
      for(Iter e = list.begin(); e < list.end(); ++e){
        Candidates cands;
        m_Table.GetCandidates(MakeTableKey(f,*(*e)), &cands);
        m_Cache.insert(std::make_pair(MakeCacheKey(f,*(*e)),cands));
      }
    }
  }
};
*/
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -