pdtaimp.h.svn-base
  friend std::ostream& operator<<(std::ostream& out, State const& s) {
    out << " R=(" << s.begin() << "," << s.end()
        << "),SC=(" << s.GetScore() << "," << s.realWords << ")";
    return out;
  }
}; // end of struct State

void CreateTargetPhrase(TargetPhrase& targetPhrase,
                        StringTgtCand::first_type const& factorStrings,
                        StringTgtCand::second_type const& scoreVector,
                        Phrase const* srcPtr=0) const {
  FactorCollection &factorCollection = FactorCollection::Instance();

  for(size_t k=0; k<factorStrings.size(); ++k) {
    std::vector<std::string> factors =
      TokenizeMultiCharSeparator(*factorStrings[k],
                                 StaticData::Instance().GetFactorDelimiter());
    Word& w = targetPhrase.AddWord();
    for(size_t l=0; l<m_output.size(); ++l)
      w[m_output[l]] = factorCollection.AddFactor(Output, m_output[l], factors[l]);
  }

  targetPhrase.SetScore(m_obj, scoreVector, m_weights, m_weightWP, *m_languageModels);
  targetPhrase.SetSourcePhrase(srcPtr);
}

TargetPhraseCollection* PruneTargetCandidates(std::vector<TargetPhrase> const & tCands,
                                              std::vector<std::pair<float,size_t> >& costs) const {
  // convert into TargetPhraseCollection
  TargetPhraseCollection *rv = new TargetPhraseCollection;

  // set limit to tableLimit or actual size, whichever is smaller
  std::vector<std::pair<float,size_t> >::iterator nth =
    costs.begin() + ((m_obj->m_tableLimit>0 && // 0 indicates no limit
                      m_obj->m_tableLimit < costs.size())
                     ? m_obj->m_tableLimit : costs.size());

  // find the nth phrase according to future cost
  std::nth_element(costs.begin(), nth, costs.end());

  // add n top phrases to the return list
  for(std::vector<std::pair<float,size_t> >::iterator it = costs.begin(); it != nth; ++it)
    rv->Add(new TargetPhrase(tCands[it->second]));

  return rv;
}

// POD for target phrase scores
struct TScores {
  float total;
  StringTgtCand::second_type trans;
  Phrase const* src;

  TScores() : total(0.0), src(0) {}
};

void CacheSource(ConfusionNet const& src) {
  assert(m_dict);
  const size_t srcSize = src.GetSize();

  std::vector<size_t> exploredPaths(srcSize+1, 0);
  std::vector<double> exPathsD(srcSize+1, -1.0);

  // collect some statistics
  std::vector<size_t> cnDepths(srcSize, 0);
  for(size_t i=0; i<srcSize; ++i) cnDepths[i] = src[i].size();

  for(size_t len=1; len<=srcSize; ++len)
    for(size_t i=0; i<=srcSize-len; ++i) {
      double pd = 0.0;
      for(size_t k=i; k<i+len; ++k) pd += log(1.0*cnDepths[k]);
      exPathsD[len] = (exPathsD[len]>=0.0 ? addLogScale(pd, exPathsD[len]) : pd);
    }

  // update global statistics
  if(pathCN.size()<=srcSize) pathCN.resize(srcSize+1, -1.0);
  for(size_t len=1; len<=srcSize; ++len)
    pathCN[len] = pathCN[len]>=0.0 ? addLogScale(pathCN[len], exPathsD[len]) : exPathsD[len];

  if(path1Best.size()<=srcSize) path1Best.resize(srcSize+1, 0);
  for(size_t len=1; len<=srcSize; ++len) path1Best[len] += srcSize-len+1;

  if (StaticData::Instance().GetVerboseLevel() >= 2 && exPathsD.size()) {
    TRACE_ERR("path stats for current CN: \nCN (full): ");
    std::transform(exPathsD.begin()+1, exPathsD.end(),
                   std::ostream_iterator<double>(std::cerr, " "), Exp);
    TRACE_ERR("\n");
  }

  typedef StringTgtCand::first_type sPhrase;
  typedef std::map<StringTgtCand::first_type, TScores> E2Costs;

  std::map<Range, E2Costs> cov2cand;
  std::vector<State> stack;
  for(Position i=0; i < srcSize; ++i)
    stack.push_back(State(i, i, m_dict->GetRoot()));

  while(!stack.empty()) {
    State curr(stack.back());
    stack.pop_back();

    assert(curr.end() < srcSize);
    const ConfusionNet::Column &currCol = src[curr.end()];
    // in a given column, loop over all possibilities
    for(size_t colidx=0; colidx<currCol.size(); ++colidx) {
      const Word& w = currCol[colidx].first; // w = the i^th possibility in column colidx
      std::string s;
      Factors2String(w, s);
      bool isEpsilon = (s=="" || s==EPSILON);

      // do not start with epsilon (except at first position)
      if(isEpsilon && curr.begin()==curr.end() && curr.begin()>0) continue;

      // At a given node in the prefix tree, look to see if w defines an edge to
      // another node (Extend). Stay at the same node if w==EPSILON
      PPtr nextP = (isEpsilon ? curr.ptr : m_dict->Extend(curr.ptr, s));
      unsigned newRealWords = curr.realWords + (isEpsilon ? 0 : 1);

      if(nextP) { // w is a word that should be considered
        Range newRange(curr.begin(), curr.end()+src.GetColumnIncrement(curr.end(), colidx));
        float newScore = curr.GetScore() + currCol[colidx].second; // CN score
        Phrase newSrc(curr.src);
        if(!isEpsilon) newSrc.AddWord(w);

        if(newRange.second<srcSize && newScore>LOWEST_SCORE) {
          // if there is more room to grow, add a new state onto the queue
          // to be explored that represents [begin, curEnd+)
          stack.push_back(State(newRange, nextP, newScore, newRealWords));
          stack.back().src = newSrc;
        }

        std::vector<StringTgtCand> tcands;
        // now, look up the target candidates (approx. TargetPhraseCollection) for
        // the current path through the CN
        m_dict->GetTargetCandidates(nextP, tcands);

        if(newRange.second>=exploredPaths.size()+newRange.first)
          exploredPaths.resize(newRange.second-newRange.first+1, 0);
        ++exploredPaths[newRange.second-newRange.first];

        totalE += tcands.size();

        if(tcands.size()) {
          E2Costs& e2costs = cov2cand[newRange];
          Phrase const* srcPtr = uniqSrcPhr(newSrc);
          for(size_t i=0; i<tcands.size(); ++i) {
            std::vector<float> nscores(tcands[i].second.size()+m_numInputScores, 0.0);

            switch(m_numInputScores) {
              case 2:
                // do not use -newRealWords: unary minus on the unsigned would wrap around -- RZ
                nscores[1] = -1.0f * newRealWords;
                // fall through
              case 1:
                nscores[0] = newScore;
                // fall through
              case 0:
                break;
              default:
                TRACE_ERR("ERROR: too many model scaling factors for input weights 'weight-i' : "
                          << m_numInputScores << "\n");
                abort();
            }
            std::transform(tcands[i].second.begin(), tcands[i].second.end(),
                           nscores.begin() + m_numInputScores, TransformScore);

            assert(nscores.size()==m_weights.size());
            float score = std::inner_product(nscores.begin(), nscores.end(),
                                             m_weights.begin(), 0.0f);

            score -= tcands[i].first.size() * m_weightWP;

            std::pair<E2Costs::iterator,bool> p =
              e2costs.insert(std::make_pair(tcands[i].first, TScores()));

            if(p.second) ++distinctE;

            TScores & scores = p.first->second;
            // keep only the best-scoring derivation of each target string
            if(p.second || scores.total<score) {
              scores.total = score;
              scores.trans = nscores;
              scores.src   = srcPtr;
            }
          }
        }
      }
    }
  } // end while(!stack.empty())

  if (StaticData::Instance().GetVerboseLevel() >= 2 && exploredPaths.size()) {
    TRACE_ERR("CN (explored): ");
    std::copy(exploredPaths.begin()+1, exploredPaths.end(),
              std::ostream_iterator<size_t>(std::cerr, " "));
    TRACE_ERR("\n");
  }

  if(pathExplored.size()<exploredPaths.size())
    pathExplored.resize(exploredPaths.size(), 0);
  for(size_t len=1; len<=srcSize; ++len)
    pathExplored[len] += exploredPaths[len];

  m_rangeCache.resize(src.GetSize(), vTPC(src.GetSize(), 0));

  for(std::map<Range,E2Costs>::const_iterator i=cov2cand.begin(); i!=cov2cand.end(); ++i) {
    assert(i->first.first < m_rangeCache.size());
    assert(i->first.second > 0);
    assert(static_cast<size_t>(i->first.second-1) < m_rangeCache[i->first.first].size());
    assert(m_rangeCache[i->first.first][i->first.second-1]==0);

    std::vector<TargetPhrase> tCands;             tCands.reserve(i->second.size());
    std::vector<std::pair<float,size_t> > costs;  costs.reserve(i->second.size());

    for(E2Costs::const_iterator j=i->second.begin(); j!=i->second.end(); ++j) {
      TScores const & scores = j->second;
      TargetPhrase targetPhrase(Output);
      CreateTargetPhrase(targetPhrase, j->first, scores.trans, scores.src);
      costs.push_back(std::make_pair(-targetPhrase.GetFutureScore(), tCands.size()));
      tCands.push_back(targetPhrase);
      //std::cerr << i->first.first << "-" << i->first.second << ": " << targetPhrase << std::endl;
    }

    TargetPhraseCollection *rv = PruneTargetCandidates(tCands, costs);

    if(rv->IsEmpty())
      delete rv;
    else {
      m_rangeCache[i->first.first][i->first.second-1] = rv;
      m_tgtColls.push_back(rv);
    }
  }
  // free memory
  m_dict->FreeMemory();
}

size_t GetNumInputScores() const { return m_numInputScores; }
}; // end of enclosing class
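
A side note on PruneTargetCandidates above: it relies on the std::nth_element partial-partition idiom. Instead of fully sorting all candidate costs, it only guarantees that the cheapest m_tableLimit entries sit before the nth iterator, which is O(n) on average rather than O(n log n). A minimal self-contained sketch of the same idiom; the names TopCandidates and limit are hypothetical and not taken from the original:

#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

// Return the `limit` lowest-cost (cost, index) pairs; a limit of 0 means
// "no limit", mirroring the m_tableLimit convention above.
std::vector<std::pair<float, std::size_t> >
TopCandidates(std::vector<std::pair<float, std::size_t> > costs, std::size_t limit)
{
  std::vector<std::pair<float, std::size_t> >::iterator nth =
    costs.begin() + ((limit > 0 && limit < costs.size()) ? limit : costs.size());
  // After this call everything before `nth` is <= everything after it,
  // but neither side is internally sorted.
  std::nth_element(costs.begin(), nth, costs.end());
  return std::vector<std::pair<float, std::size_t> >(costs.begin(), nth);
}

Note that the front segment is not itself ranked; if sorted output were needed, std::partial_sort would do both steps at once, at slightly higher cost.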
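
The path statistics in CacheSource accumulate expected path counts in log space via addLogScale, whose implementation is not shown in this excerpt. Presumably it computes log(exp(x)+exp(y)) with the usual overflow-safe "log-sum-exp" rearrangement; a minimal sketch of such a helper (the actual Moses implementation may differ):

#include <algorithm>
#include <cmath>

// Overflow-safe log(exp(x) + exp(y)): factor out the larger exponent so the
// remaining exp() argument is <= 0 and cannot overflow.
double AddLogScaleSketch(double x, double y)
{
  double hi = std::max(x, y);
  double lo = std::min(x, y);
  return hi + std::log(1.0 + std::exp(lo - hi));
}

The -1.0 initialization of exPathsD and pathCN then acts as a "not yet set" sentinel, which works because every accumulated value is a sum of log(depth) terms with depth >= 1 and is therefore non-negative.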