📄 phrasedictionarytree.cpp.svn-base
字号:
// $Id$// vim:tabstop=2#include "PhraseDictionaryTree.h"#include <map>#include <cassert>#include <sstream>#include <iostream>#include <fstream>#include <string>#include "PrefixTree.h"#include "File.h"#include "ObjectPool.h"#include "LVoc.h"#include "TypeDef.h"#include "Util.h"template<typename T>std::ostream& operator<<(std::ostream& out,const std::vector<T>& x){ out<<x.size()<<" "; typename std::vector<T>::const_iterator iend=x.end(); for(typename std::vector<T>::const_iterator i=x.begin();i!=iend;++i) out<<*i<<' '; return out;}typedef std::vector<float> Scores;typedef PrefixTreeF<LabelId,OFF_T> PTF;class TgtCand { IPhrase e; Scores sc;public: TgtCand() {} TgtCand(const IPhrase& a,const Scores& b) : e(a),sc(b) {} TgtCand(FILE* f) {readBin(f);} const IPhrase& GetPhrase() const {return e;} const Scores& GetScores() const {return sc;} void writeBin(FILE* f) const {fWriteVector(f,e);fWriteVector(f,sc);} void readBin(FILE* f) {fReadVector(f,e);fReadVector(f,sc);} };class TgtCands : public std::vector<TgtCand> { typedef std::vector<TgtCand> MyBase;public: TgtCands() : MyBase() {} void writeBin(FILE* f) const { unsigned s=size();fWrite(f,s); for(size_t i=0;i<s;++i) MyBase::operator[](i).writeBin(f); } void readBin(FILE* f) { unsigned s;fRead(f,s);resize(s); for(size_t i=0;i<s;++i) MyBase::operator[](i).readBin(f); }};struct PPimp { PTF const*p;unsigned idx;bool root; PPimp(PTF const* x,unsigned i,bool b) : p(x),idx(i),root(b) {} bool isValid() const {return root || (p && idx<p->size());} bool isRoot() const {return root;} PTF const* ptr() const {return p;}};PhraseDictionaryTree::PrefixPtr::operator bool() const { return imp && imp->isValid();}struct PDTimp { typedef PrefixTreeF<LabelId,OFF_T> PTF; typedef FilePtr<PTF> CPT; typedef std::vector<CPT> Data; typedef LVoc<std::string> WordVoc; Data data; std::vector<OFF_T> srcOffsets; FILE *os,*ot; WordVoc sv,tv; ObjectPool<PPimp> pPool; // a comparison with the Boost MemPools might be useful PDTimp() : os(0),ot(0) {PTF::setDefault(InvalidOffT);} ~PDTimp() {if(os) fClose(os);if(ot) fClose(ot);FreeMemory();} void FreeMemory() { for(Data::iterator i=data.begin();i!=data.end();++i) (*i).free(); pPool.reset(); } int Read(const std::string& fn); void GetTargetCandidates(const IPhrase& f,TgtCands& tgtCands) { if(f.empty()) return; if(f[0]>=data.size()) return; if(!data[f[0]]) return; assert(data[f[0]]->findKey(f[0])<data[f[0]]->size()); OFF_T tCandOffset=data[f[0]]->find(f); if(tCandOffset==InvalidOffT) return; fSeek(ot,tCandOffset); tgtCands.readBin(ot); } typedef PhraseDictionaryTree::PrefixPtr PPtr; void GetTargetCandidates(PPtr p,TgtCands& tgtCands) { assert(p); if(p.imp->isRoot()) return; OFF_T tCandOffset=p.imp->ptr()->getData(p.imp->idx); if(tCandOffset==InvalidOffT) return; fSeek(ot,tCandOffset); tgtCands.readBin(ot); } void PrintTgtCand(const TgtCands& tcands,std::ostream& out) const; // convert target candidates from internal data structure to the external one void ConvertTgtCand(const TgtCands& tcands,std::vector<StringTgtCand>& rv) const { for(TgtCands::const_iterator i=tcands.begin();i!=tcands.end();++i) { const IPhrase& iphrase=i->GetPhrase(); std::vector<std::string const*> vs; vs.reserve(iphrase.size()); for(size_t j=0;j<iphrase.size();++j) vs.push_back(&tv.symbol(iphrase[j])); rv.push_back(StringTgtCand(vs,i->GetScores())); } } PPtr GetRoot() { return PPtr(pPool.get(PPimp(0,0,1))); } PPtr Extend(PPtr p,const std::string& w) { assert(p); if(w.empty() || w==EPSILON) return p; LabelId wi=sv.index(w); if(wi==InvalidLabelId) return PPtr(); // unknown word else if(p.imp->isRoot()) { if(wi<data.size() && data[wi]) { assert(data[wi]->findKeyPtr(wi)); return PPtr(pPool.get(PPimp(data[wi],data[wi]->findKey(wi),0))); } } else if(PTF const* nextP=p.imp->ptr()->getPtr(p.imp->idx)) { return PPtr(pPool.get(PPimp(nextP,nextP->findKey(wi),0))); } return PPtr(); }};//////////////////////////////////////////////////////////////// member functions of PDTimp//////////////////////////////////////////////////////////////int PDTimp::Read(const std::string& fn) { std::string ifs(fn+".binphr.srctree"), ift(fn+".binphr.tgtdata"), ifi(fn+".binphr.idx"), ifsv(fn+".binphr.srcvoc"), iftv(fn+".binphr.tgtvoc"); FILE *ii=fOpen(ifi.c_str(),"rb"); fReadVector(ii,srcOffsets); fClose(ii); os=fOpen(ifs.c_str(),"rb"); ot=fOpen(ift.c_str(),"rb"); data.resize(srcOffsets.size()); for(size_t i=0;i<data.size();++i) data[i]=CPT(os,srcOffsets[i]); sv.Read(ifsv); tv.Read(iftv); TRACE_ERR("binary phrasefile loaded, default OFF_T: "<<PTF::getDefault() <<"\n"); return 1;}void PDTimp::PrintTgtCand(const TgtCands& tcand,std::ostream& out) const{ for(size_t i=0;i<tcand.size();++i) { out<<i<<" -- "<<tcand[i].GetScores()<<" -- "; const IPhrase& iphr=tcand[i].GetPhrase(); for(size_t j=0;j<iphr.size();++j) out<<tv.symbol(iphr[j])<<" "; out<<'\n'; }}//////////////////////////////////////////////////////////////// member functions of PhraseDictionaryTree//////////////////////////////////////////////////////////////PhraseDictionaryTree::PhraseDictionaryTree(size_t numScoreComponent) : Dictionary(numScoreComponent),imp(new PDTimp){ if(sizeof(OFF_T)!=8) { TRACE_ERR("ERROR: size of type 'OFF_T' has to be 64 bit!\n" "In gcc, use compiler settings '-D_FILE_OFFSET_BITS=64 -D_LARGE_FILES'\n" " -> abort \n\n"); abort(); }}PhraseDictionaryTree::~PhraseDictionaryTree() { delete imp;}void PhraseDictionaryTree::FreeMemory() const{ imp->FreeMemory();}void PhraseDictionaryTree::GetTargetCandidates(const std::vector<std::string>& src, std::vector<StringTgtCand>& rv) const { IPhrase f(src.size()); for(size_t i=0;i<src.size();++i) { f[i]=imp->sv.index(src[i]); if(f[i]==InvalidLabelId) return; } TgtCands tgtCands; imp->GetTargetCandidates(f,tgtCands); imp->ConvertTgtCand(tgtCands,rv);}void PhraseDictionaryTree::PrintTargetCandidates(const std::vector<std::string>& src, std::ostream& out) const { IPhrase f(src.size()); for(size_t i=0;i<src.size();++i) { f[i]=imp->sv.index(src[i]); if(f[i]==InvalidLabelId) { TRACE_ERR("the source phrase '"<<src<<"' contains an unknown word '" <<src[i]<<"'\n"); return; } } TgtCands tcand; imp->GetTargetCandidates(f,tcand); out<<"there are "<<tcand.size()<<" target candidates\n"; imp->PrintTgtCand(tcand,out);}int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out) { std::string line; size_t count = 0; std::string ofn(out+".binphr.srctree"), oft(out+".binphr.tgtdata"), ofi(out+".binphr.idx"), ofsv(out+".binphr.srcvoc"), oftv(out+".binphr.tgtvoc"); FILE *os=fOpen(ofn.c_str(),"wb"), *ot=fOpen(oft.c_str(),"wb"); typedef PrefixTreeSA<LabelId,OFF_T> PSA; PSA *psa=new PSA;PSA::setDefault(InvalidOffT); LabelId currFirstWord=InvalidLabelId; IPhrase currF; TgtCands tgtCands; std::vector<OFF_T> vo; size_t lnc=0; size_t numElement = NOT_FOUND; // 3=old format, 5=async format which include word alignment info while(getline(inFile, line)) { ++lnc; std::vector<std::string> tokens = TokenizeMultiCharSeparator( line , "|||" ); if (numElement == NOT_FOUND) { // init numElement numElement = tokens.size(); assert(numElement == 3 || numElement == 5); } else if (tokens.size() != numElement) { std::stringstream strme; strme << "Syntax error at line " << lnc << " : " << line; UserMessage::Add(strme.str()); abort(); } IPhrase f,e;Scores sc; std::vector<std::string> wordVec = Tokenize(tokens[0]); for (size_t i = 0 ; i < wordVec.size() ; ++i) f.push_back(imp->sv.add(wordVec[i])); wordVec = Tokenize(tokens[1]); for (size_t i = 0 ; i < wordVec.size() ; ++i) e.push_back(imp->tv.add(wordVec[i])); // while(is>>w && w!="|||") sc.push_back(atof(w.c_str())); // Mauro: to handle 0 probs in phrase tables std::vector<float> scoreVector = Tokenize<float>(tokens[(numElement==3) ? 2 : 4]); for (size_t i = 0 ; i < scoreVector.size() ; ++i) { float tmp = scoreVector[i]; sc.push_back(((tmp>0.0)?tmp:(float)1.0e-38)); } if(f.empty()) { TRACE_ERR("WARNING: empty source phrase in line '"<<line<<"'\n"); continue; } if(currFirstWord==InvalidLabelId) currFirstWord=f[0]; if(currF.empty()) { ++count; currF=f; // insert src phrase in prefix tree assert(psa); PSA::Data& d=psa->insert(f); if(d==InvalidOffT) d=fTell(ot); else { TRACE_ERR("ERROR: source phrase already inserted (A)!\nline(" << lnc << "): '" <<line<<"'\nf: "<<f<<"\n"); abort(); } } if(currF!=f) { // new src phrase currF=f; tgtCands.writeBin(ot); tgtCands.clear(); if(++count%10000==0) { TRACE_ERR("."); if(count%500000==0) TRACE_ERR("[phrase:"<<count<<"]\n"); } if(f[0]!=currFirstWord) { // write src prefix tree to file and clear PTF pf; if(currFirstWord>=vo.size()) vo.resize(currFirstWord+1,InvalidOffT); vo[currFirstWord]=fTell(os); pf.create(*psa,os); // clear delete psa;psa=new PSA; currFirstWord=f[0]; } // insert src phrase in prefix tree assert(psa); PSA::Data& d=psa->insert(f); if(d==InvalidOffT) d=fTell(ot); else { TRACE_ERR("ERROR: xsource phrase already inserted (B)!\nline(" << lnc << "): '" <<line<<"'\nf: "<<f<<"\n"); abort(); } } tgtCands.push_back(TgtCand(e,sc)); assert(currFirstWord!=InvalidLabelId); } tgtCands.writeBin(ot);tgtCands.clear(); PTF pf; if(currFirstWord>=vo.size()) vo.resize(currFirstWord+1,InvalidOffT); vo[currFirstWord]=fTell(os); pf.create(*psa,os); delete psa;psa=0; TRACE_ERR("distinct source phrases: "<<count <<" distinct first words of source phrases: "<<vo.size() <<" number of phrase pairs (line count): "<<lnc <<"\n"); fClose(os); fClose(ot); std::vector<size_t> inv; for(size_t i=0;i<vo.size();++i) if(vo[i]==InvalidOffT) inv.push_back(i); if(inv.size()) { TRACE_ERR("WARNING: there are src voc entries with no phrase " "translation: count "<<inv.size()<<"\n" "There exists phrase translations for "<<vo.size()-inv.size() <<" entries\n"); } FILE *oi=fOpen(ofi.c_str(),"wb"); fWriteVector(oi,vo); fClose(oi); imp->sv.Write(ofsv); imp->tv.Write(oftv); return 1;}int PhraseDictionaryTree::Read(const std::string& fn) { TRACE_ERR("size of OFF_T "<<sizeof(OFF_T)<<"\n"); return imp->Read(fn);} PhraseDictionaryTree::PrefixPtr PhraseDictionaryTree::GetRoot() const { return imp->GetRoot(); }PhraseDictionaryTree::PrefixPtr PhraseDictionaryTree::Extend(PrefixPtr p, const std::string& w) const { return imp->Extend(p,w);}void PhraseDictionaryTree::PrintTargetCandidates(PrefixPtr p,std::ostream& out) const { TgtCands tcand; imp->GetTargetCandidates(p,tcand); out<<"there are "<<tcand.size()<<" target candidates\n"; imp->PrintTgtCand(tcand,out);}void PhraseDictionaryTree::GetTargetCandidates(PrefixPtr p, std::vector<StringTgtCand>& rv) const { TgtCands tcands; imp->GetTargetCandidates(p,tcands); imp->ConvertTgtCand(tcands,rv);}std::string PhraseDictionaryTree::GetScoreProducerDescription() const{ return "Phrase dictionary tree";}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -