⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 generatetuples.cpp.svn-base

📁 moses开源的机器翻译系统
💻 SVN-BASE
字号:
//////////////////////////////////////////////////////////////// generate set of target candidates for confusion net//////////////////////////////////////////////////////////////#include <numeric>#include "Word.h"#include "Phrase.h"#include "ConfusionNet.h"#include "WordsRange.h"#include "PhraseDictionaryTree.h"#if 0// Generates all tuples from  n indexes with ranges 0 to card[j]-1, respectively..// Input: number of indexes and  ranges: ranges[0] ... ranges[num_idx-1] // Output: number of tuples and monodimensional array of tuples.// Reference: mixed-radix generation algorithm (D. E. Knuth, TAOCP v. 4.2)size_t GenerateTuples(unsigned num_idx,unsigned* ranges,unsigned *&tuples){  unsigned* single_tuple= new unsigned[num_idx+1];  unsigned num_tuples=1;  for (unsigned k=0;k<num_idx;++k)    {      num_tuples *= ranges[k];      single_tuple[k]=0;    }  tuples=new unsigned[num_idx * num_tuples];  // we need this additional element for the last iteration  single_tuple[num_idx]=0;   unsigned j=0;  for (unsigned n=0;n<num_tuples;++n){    memcpy((void *)((tuples + n * num_idx)),(void *)single_tuple,num_idx * sizeof(unsigned));    j=0;    while (single_tuple[j]==ranges[j]-1){single_tuple[j]=0; ++j;}    ++single_tuple[j];  }	delete [] single_tuple;  return num_tuples;}typedef PhraseDictionaryTree::PrefixPtr PPtr;typedef std::vector<PPtr> vPPtr;typedef std::vector<std::vector<Factor const*> > mPhrase;std::ostream& operator<<(std::ostream& out,const mPhrase& p) {	for(size_t i=0;i<p.size();++i) {		out<<i<<" - ";		for(size_t j=0;j<p[i].size();++j)			out<<p[i][j]->ToString()<<" ";		out<<"|";	}	return out;}struct State {	vPPtr ptrs;	WordsRange range;	float score;	State() : range(0,0),score(0.0) {}	State(size_t b,size_t e,const vPPtr& v,float sc=0.0) : ptrs(v),range(b,e),score(sc) {}		size_t begin() const {return range.GetStartPos();}	size_t end() const {return range.GetEndPos();}	float GetScore() const {return score;}};std::ostream& operator<<(std::ostream& out,const State& s) {	out<<"["<<s.ptrs.size()<<" ("<<s.begin()<<","<<s.end()<<") "<<s.GetScore()<<"]";	return out;}typedef std::map<mPhrase,float> E2Costs;struct GCData {	const std::vector<PhraseDictionaryTree const*>& pdicts;	const std::vector<std::vector<float> >& weights;	std::vector<FactorType> inF,outF;	size_t distinctOutputFactors;	vPPtr root;	size_t totalTuples,distinctTuples;	GCData(const std::vector<PhraseDictionaryTree const*>& a,				 const std::vector<std::vector<float> >& b) 		: pdicts(a),weights(b),totalTuples(0),distinctTuples(0) {		assert(pdicts.size()==weights.size());		std::set<FactorType> distinctOutFset;		inF.resize(pdicts.size());		outF.resize(pdicts.size());		root.resize(pdicts.size());		for(size_t i=0;i<pdicts.size();++i) 			{				root[i]=pdicts[i]->GetRoot();				inF[i]=pdicts[i]->GetInputFactorType();				outF[i]=pdicts[i]->GetOutputFactorType();				distinctOutFset.insert(pdicts[i]->GetOutputFactorType());			}		distinctOutputFactors=distinctOutFset.size();	}	FactorType OutFT(size_t i) const {return outF[i];}	FactorType InFT(size_t i) const {return inF[i];}	size_t DistinctOutFactors() const {return distinctOutputFactors;}	const vPPtr& GetRoot() const {return root;}};typedef std::vector<Factor const*> vFactor;typedef std::vector<std::pair<float,vFactor> > TgtCandList;typedef std::vector<TgtCandList> OutputFactor2TgtCandList;typedef std::vector<OutputFactor2TgtCandList*> Len2Cands;void GeneratePerFactorTgtList(size_t factorType,PPtr pptr,GCData& data,Len2Cands& len2cands){	std::vector<FactorTgtCand> cands;	data.pdicts[factorType]->GetTargetCandidates(pptr,cands);	for(std::vector<FactorTgtCand>::const_iterator cand=cands.begin();cand!=cands.end();++cand) {		assert(data.weights[factorType].size()==cand->second.size());		float costs=std::inner_product(data.weights[factorType].begin(),																	 data.weights[factorType].end(),																	 cand->second.begin(),																	 0.0);		size_t len=cand->first.size();		if(len>=len2cands.size()) len2cands.resize(len+1,0);		if(!len2cands[len]) len2cands[len]=new OutputFactor2TgtCandList(data.DistinctOutFactors());		OutputFactor2TgtCandList &outf2tcandlist=*len2cands[len];		outf2tcandlist[data.OutFT(factorType)].push_back(std::make_pair(costs,cand->first));	}}void GenerateTupleTgtCands(OutputFactor2TgtCandList& tCand,E2Costs& e2costs,GCData& data) {	// check if candidates are non-empty	bool gotCands=1;	for(size_t j=0;gotCands && j<tCand.size();++j)		gotCands &= !tCand[j].empty();					if(gotCands) {		// enumerate tuples		assert(data.DistinctOutFactors()==tCand.size());		std::vector<unsigned> radix(data.DistinctOutFactors());		for(size_t i=0;i<tCand.size();++i) radix[i]=tCand[i].size();		unsigned *tuples=0;		size_t numTuples=GenerateTuples(radix.size(),&radix[0],tuples);		data.totalTuples+=numTuples;		for(size_t i=0;i<numTuples;++i)			{				mPhrase e(radix.size());float costs=0.0;				for(size_t j=0;j<radix.size();++j)					{						assert(tuples[radix.size()*i+j]<tCand[j].size());						std::pair<float,vFactor> const& mycand=tCand[j][tuples[radix.size()*i+j]];						e[j]=mycand.second;						costs+=mycand.first;					}#ifdef DEBUG				bool mismatch=0;				for(size_t j=1;!mismatch && j<e.size();++j)					if(e[j].size()!=e[j-1].size()) mismatch=1;				assert(mismatch==0);#endif				std::pair<E2Costs::iterator,bool> p=e2costs.insert(std::make_pair(e,costs));				if(p.second) ++data.distinctTuples;				else {					// entry known, take min of costs, alternative: sum probs					if(costs<p.first->second) p.first->second=costs;				}			}		delete [] tuples;	}}void GenerateCandidates_(E2Costs& e2costs,const vPPtr& nextP,GCData& data) {	Len2Cands len2cands;	// generate candidates for each element of nextP:	for(size_t factorType=0;factorType<nextP.size();++factorType) 		if(nextP[factorType]) 			GeneratePerFactorTgtList(factorType,nextP[factorType],data,len2cands);	// for each length: enumerate tuples, compute score, and insert in e2costs	for(size_t len=0;len<len2cands.size();++len) if(len2cands[len]) 		GenerateTupleTgtCands(*len2cands[len],e2costs,data);}void GenerateCandidates(const ConfusionNet& src,												const std::vector<PhraseDictionaryTree const*>& pdicts,												const std::vector<std::vector<float> >& weights,												int verbose) {	GCData data(pdicts,weights);	std::vector<State> stack;	for(size_t i=0;i<src.GetSize();++i) stack.push_back(State(i,i,data.GetRoot()));	std::map<WordsRange,E2Costs> cov2E;	//	std::cerr<<"start while loop. initial stack size: "<<stack.size()<<"\n";	while(!stack.empty()) 	{		State curr(stack.back());		stack.pop_back();				//std::cerr<<"processing state "<<curr<<" stack size: "<<stack.size()<<"\n";		assert(curr.end()<src.GetSize());		const ConfusionNet::Column &currCol=src[curr.end()];		for(size_t colidx=0;colidx<currCol.size();++colidx) 		{			const Word& w=currCol[colidx].first;			vPPtr nextP(curr.ptrs);			for(size_t j=0;j<nextP.size();++j)				nextP[j]=pdicts[j]->Extend(nextP[j],																	 w.GetFactor(data.InFT(j))->GetString());				bool valid=1;			for(size_t j=0;j<nextP.size();++j) if(!nextP[j]) {valid=0;break;}			if(valid) 			{				if(curr.end()+1<src.GetSize())					stack.push_back(State(curr.begin(),curr.end()+1,nextP,																curr.GetScore()+currCol[colidx].second));				E2Costs &e2costs=cov2E[WordsRange(curr.begin(),curr.end()+1)];				GenerateCandidates_(e2costs,nextP,data);			}		}					// check if there are translations of one-word phrases ...		//if(curr.begin()==curr.end() && tCand.empty()) {}			} // end while(!stack.empty()) 	if(verbose) {		// print statistics for debugging purposes		std::cerr<<"tuple stats:  total: "<<data.totalTuples						 <<" distinct: "<<data.distinctTuples<<" ("						 <<(data.distinctTuples/(0.01*data.totalTuples))						 <<"%)\n";		std::cerr<<"per coverage set:\n";		for(std::map<WordsRange,E2Costs>::const_iterator i=cov2E.begin();				i!=cov2E.end();++i) {			std::cerr<<i->first<<" -- distinct cands: "							 <<i->second.size()<<"\n";		}		std::cerr<<"\n\n";	}	if(verbose>10) {		std::cerr<<"full list:\n";		for(std::map<WordsRange,E2Costs>::const_iterator i=cov2E.begin();				i!=cov2E.end();++i) {			std::cerr<<i->first<<" -- distinct cands: "							 <<i->second.size()<<"\n";			for(E2Costs::const_iterator j=i->second.begin();j!=i->second.end();++j)				std::cerr<<j->first<<" -- "<<j->second<<"\n";		}	}}#elsevoid GenerateCandidates(const ConfusionNet&,												const std::vector<PhraseDictionaryTree const*>&,												const std::vector<std::vector<float> >&,												int) {	std::cerr<<"ERROR: GenerateCandidates is currently broken\n";}#endif

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -