📄 lmtable.cpp.svn-base
字号:
out << "ngram " << l << "= " << cursize[l] << "\n"; } for (l=1;l<=maxlev;l++){ out << "\n\\" << l << "-grams:\n"; cerr << "save: " << cursize[l] << " " << l << "-grams\n"; if (isQtable){ out << NumCenters[l] << "\n"; for (int c=0;c<NumCenters[l];c++){ out << Pcenters[l][c]; if (l<maxlev) out << " " << Bcenters[l][c]; out << "\n"; } } ng.size=0; dumplm(out,ng,1,l,0,cursize[1]); } out << "\\end\\\n"; cerr << "done\n";}void lmtable::savebin(const char *filename){ fstream out(filename,ios::out); cerr << "savebin: " << filename << "\n"; // print header if (isQtable){ out << "Qblmt " << maxlev; for (int i=1;i<=maxlev;i++) out << " " << cursize[i]; out << "\nNumCenters"; for (int i=1;i<=maxlev;i++) out << " " << NumCenters[i]; out << "\n"; }else{ out << "blmt " << maxlev; for (int i=1;i<=maxlev;i++) out << " " << cursize[i] ; out << "\n"; } dict->save(out); for (int i=1;i<=maxlev;i++){ cerr << "saving " << cursize[i] << " " << i << "-grams\n"; if (isQtable){ out.write((char*)Pcenters[i],NumCenters[i] * sizeof(float)); if (i<maxlev) out.write((char *)Bcenters[i],NumCenters[i] * sizeof(float)); } out.write(table[i],cursize[i]*nodesize(tbltype[i])); } cerr << "done\n";}//manages the long header of a bin file//and allocates table for each n-gram levelvoid lmtable::loadbinheader(istream& inp,const char* header){ // read rest of header inp >> maxlev; if (strncmp(header,"Qblmt",5)==0) isQtable=1; else if(strncmp(header,"blmt",4)==0) isQtable=0; else error("loadbin: LM file is not in binary format"); configure(maxlev,isQtable); for (int l=1;l<=maxlev;l++){ inp >> cursize[l]; maxsize[l]=cursize[l]; } if (isQtable){ char header2[100]; inp >> header2; for (int i=1;i<=maxlev;i++){ inp >> NumCenters[i]; cerr << "reading " << NumCenters[i] << " centers\n"; } } }//load codebook of level lvoid lmtable::loadbincodebook(istream& inp,int l){ Pcenters[l]=new float [NumCenters[l]]; inp.read((char*)Pcenters[l],NumCenters[l] * sizeof(float)); if (l<maxlev){ Bcenters[l]=new float [NumCenters[l]]; inp.read((char *)Bcenters[l],NumCenters[l]*sizeof(float)); } } //load a binary lmfilevoid lmtable::loadbin(istream& inp, const char* header,const char* filename,int mmap){ cerr << "loadbin()\n"; loadbinheader(inp,header); dict->load(inp); //if MMAP is used, then open the file if (filename && mmap>0){ #ifdef WIN32 error("lmtable::loadbin mmap facility not yet supported under WIN32\n");#else if (mmap <= maxlev) memmap=mmap; else error("keep_on_disk value is out of range\n"); if ((diskid=open(filename, O_RDONLY))<0){ std::cerr << "cannot open " << filename << "\n"; error("dying"); } //check that the LM is uncompressed char miniheader[4]; read(diskid,miniheader,4); if (strncmp(miniheader,"Qblm",4) && strncmp(miniheader,"blmt",4)) error("mmap functionality does not work with compressed binary LMs\n"); #endif } for (int l=1;l<=maxlev;l++){ if (isQtable) loadbincodebook(inp,l); if ((memmap == 0) || (l < memmap)){ cerr << "loading " << cursize[l] << " " << l << "-grams\n"; table[l]=new char[cursize[l] * nodesize(tbltype[l])]; inp.read(table[l],cursize[l] * nodesize(tbltype[l])); } else{ #ifdef WIN32 error("mmap not available under WIN32\n");#else cerr << "mapping " << cursize[l] << " " << l << "-grams\n"; tableOffs[l]=inp.tellg(); table[l]=(char *)MMap(diskid,PROT_READ, tableOffs[l], cursize[l]*nodesize(tbltype[l]), &tableGaps[l]); table[l]+=tableGaps[l]; inp.seekg(cursize[l]*nodesize(tbltype[l]),ios_base::cur);#endif } }; cerr << "done\n"; }int lmtable::get(ngram& ng,int n,int lev){ // cout << "cerco:" << ng << "\n"; totget[lev]++; if (lev > maxlev) error("get: lev exceeds maxlevel"); if (n < lev) error("get: ngram is too small"); //set boudaries for 1-gram int offset=0,limit=cursize[1]; //information of table entries int hit;char* found; LMT_TYPE ndt; ng.link=NULL; ng.lev=0; for (int l=1;l<=lev;l++){ //initialize entry information hit = 0 ; found = NULL; ndt=tbltype[l]; if (lmtcache[l] && lmtcache[l]->get(ng.wordp(n),(char *)&found)) hit=1; else search(l, offset, (limit-offset), nodesize(ndt), ng.wordp(n-l+1), LMT_FIND, &found); //insert both found and not found items!!! if (lmtcache[l] && hit==0) lmtcache[l]->add(ng.wordp(n),(char *)&found); if (!found) return 0; ng.bow=(l<maxlev?bow(found,ndt):0); ng.prob=prob(found,ndt); ng.link=found; ng.info=ndt; ng.lev=l; if (l<maxlev){ //set start/end point for next search //if current offset is at the bottom also that of successors will be if (offset+1==cursize[l]) limit=cursize[l+1]; else limit=bound(found,ndt); //if current start is at the begin, then also that of successors will be if (found==table[l]) offset=0; else offset=bound((found - nodesize(ndt)),ndt); assert(offset!=-1); assert(limit!=-1); } } //put information inside ng ng.size=n; ng.freq=0; ng.succ=(lev<maxlev?limit-offset:0); #ifdef TRACE_CACHE if (ng.size==maxlev && sentence_id>0){ *cacheout << sentence_id << " miss " << ng << " " << (unsigned int) ng.link << "\n"; }#endif return 1;}//recursively prints the language model tablevoid lmtable::dumplm(fstream& out,ngram ng, int ilev, int elev, int ipos,int epos){ LMT_TYPE ndt=tbltype[ilev]; int ndsz=nodesize(ndt); assert(ng.size==ilev-1); assert(ipos>=0 && epos<=cursize[ilev] && ipos<epos); ng.pushc(0); for (int i=ipos;i<epos;i++){ *ng.wordp(1)=word(table[ilev]+i*ndsz); if (ilev<elev){ //get first and last successor position int isucc=(i>0?bound(table[ilev]+(i-1)*ndsz,ndt):0); int esucc=bound(table[ilev]+i*ndsz,ndt); if (isucc < esucc) //there are successors! dumplm(out,ng,ilev+1,elev,isucc,esucc); //else //cout << "no successors for " << ng << "\n"; } else{ //out << i << " "; //this was just to count printed n-grams int ipr=prob(table[ilev]+ i * ndsz,ndt); out << (isQtable?ipr:*(float *)&ipr) <<"\t"; for (int k=ng.size;k>=1;k--){ if (k<ng.size) out << " "; out << dict->decode(*ng.wordp(k)); } if (ilev<maxlev){ int ibo=bow(table[ilev]+ i * ndsz,ndt); if (isQtable) out << "\t" << ibo; else if (*((float *)&ibo)!=0.0) out << "\t" << *((float *)&ibo); } out << "\n"; } }}//succscan iteratively returns all successors of an ngram h for which //get(h,h.size,h.size) returned true. int lmtable::succscan(ngram& h,ngram& ng,LMT_ACTION action,int lev){ assert(lev==h.lev+1 && h.size==lev && lev<=maxlev); LMT_TYPE ndt=tbltype[h.lev]; int ndsz=nodesize(ndt); switch (action){ case LMT_INIT: //reset ngram local indexes ng.size=lev; ng.trans(h); ng.midx[lev]=(h.link>table[h.lev]?bound(h.link-ndsz,ndt):0); return 1; case LMT_CONT: if (ng.midx[lev]<bound(h.link,ndt)) { //put current word into ng *ng.wordp(1)=word(table[lev]+ng.midx[lev]*nodesize(tbltype[lev])); ng.midx[lev]++; return 1; } else return 0; default: cerr << "succscan: only permitted options are LMT_INIT and LMT_CONT\n"; exit(0); } }//maxsuffptr returns the largest suffix of an n-gram that is contained //in the LM table. This can be used as a compact representation of the //(n-1)-gram state of a n-gram LM. if the input k-gram has k>=n then it //is trimmed to its n-1 suffix.const char *lmtable::maxsuffptr(ngram ong){ if (ong.size==0) return (char*) NULL; if (ong.size>=maxlev) ong.size=maxlev-1; ngram ng=ong; //ngram ng(dict); //eventually use the <unk> word //ng.trans(ong); if (get(ng,ng.size,ng.size)) return ng.link; else{ ong.size--; return maxsuffptr(ong); }}const char *lmtable::cmaxsuffptr(ngram ong){ if (ong.size==0) return (char*) NULL; if (ong.size>=maxlev) ong.size=maxlev-1; char* found; if (statecache && (ong.size==maxlev-1) && statecache->get(ong.wordp(maxlev-1),(char *)&found)) return found; found=(char *)maxsuffptr(ong); if (statecache && ong.size==maxlev-1){ //if (statecache->isfull()) statecache->reset(); statecache->add(ong.wordp(maxlev-1),(char *)&found); }; return found; }//return log10 probsdouble lmtable::lprob(ngram ong){ if (ong.size==0) return 0.0; if (ong.size>maxlev) ong.size=maxlev; ngram ng=ong; //ngram ng(dict); //avoid dictionary transfer //ng.trans(ong); double rbow; int ibow,iprob; if (get(ng,ng.size,ng.size)){ iprob=ng.prob; return (double)(isQtable?Pcenters[ng.size][iprob]:*((float *)&iprob)); } else{ //size==1 means an OOV word if (ng.size==1) return -log(UNIGRAM_RESOLUTION)/log(10.0); else{ // compute backoff //set backoff state, shift n-gram, set default bow prob bo_state(1); ng.shift();rbow=0.0; if (ng.lev==ng.size){ ibow=ng.bow; rbow= (double) (isQtable?Bcenters[ng.size][ibow]:*((float *)&ibow)); } //prepare recursion step ong.size--; return rbow + lprob(ong); } }}//return log10 probsL use cache memorydouble lmtable::clprob(ngram ong){ if (ong.size==0) return 0.0; if (ong.size>maxlev) ong.size=maxlev; double logpr; #ifdef TRACE_CACHE if (probcache && ong.size==maxlev && sentence_id>0){ *cacheout << sentence_id << " " << ong << "\n"; }#endif //cache hit if (probcache && ong.size==maxlev && probcache->get(ong.wordp(maxlev),(char *)&logpr)){ return logpr; } //cache miss logpr=lprob(ong); if (probcache && ong.size==maxlev){ probcache->add(ong.wordp(maxlev),(char *)&logpr); }; return logpr;};void lmtable::stat(int level){ int totmem=0,memory; float mega=1024 * 1024; cout.precision(2); cout << "lmtable class statistics\n"; cout << "levels " << maxlev << "\n"; for (int l=1;l<=maxlev;l++){ memory=cursize[l] * nodesize(tbltype[l]); cout << "lev " << l << " entries "<< cursize[l] << " used mem " << memory/mega << "Mb\n"; totmem+=memory; } cout << "total allocated mem " << totmem/mega << "Mb\n"; cout << "total number of get and binary search calls\n"; for (int l=1;l<=maxlev;l++){ cout << "level " << l << " get: " << totget[l] << " bsearch: " << totbsearch[l] << "\n"; } if (level >1 ) dict->stat(); }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -