lmtable.cpp.svn-base

来自「解码器是基于短语的统计机器翻译系统的核心模块」· SVN-BASE 代码 · 共 929 行 · 第 1/2 页
SVN-BASE
929 行
    out << "ngram " << l << "= " << cursize[l] << "\n";  }    for (l=1;l<=maxlev;l++){        out << "\n\\" << l << "-grams:\n";    cerr << "save: " << cursize[l] << " " << l << "-grams\n";    if (isQtable){      out << NumCenters[l] << "\n";      for (int c=0;c<NumCenters[l];c++){        out << Pcenters[l][c];        if (l<maxlev) out << " " << Bcenters[l][c];        out << "\n";      }    }		    ng.size=0;    dumplm(out,ng,1,l,0,cursize[1]);		  }	  out << "\\end\\\n";    cerr << "done\n";}void lmtable::savebin(const char *filename){	  fstream out(filename,ios::out);    cerr << "savebin: " << filename << "\n";	  // print header  if (isQtable){    out << "Qblmt " << maxlev;    for (int i=1;i<=maxlev;i++) out << " " << cursize[i];    out << "\nNumCenters";    for (int i=1;i<=maxlev;i++)  out << " " << NumCenters[i];    out << "\n";      }else{    out << "blmt " << maxlev;    for (int i=1;i<=maxlev;i++) out << " " << cursize[i] ;    out << "\n";  }	  dict->save(out);    for (int i=1;i<=maxlev;i++){    cerr << "saving " << cursize[i] << " " << i << "-grams\n";    if (isQtable){      out.write((char*)Pcenters[i],NumCenters[i] * sizeof(float));      if (i<maxlev)         out.write((char *)Bcenters[i],NumCenters[i] * sizeof(float));    }    out.write(table[i],cursize[i]*nodesize(tbltype[i]));  }    cerr << "done\n";}//manages the long header of a bin file//and allocates table for each n-gram levelvoid lmtable::loadbinheader(istream& inp,const char* header){    // read rest of header  inp >> maxlev;    if (strncmp(header,"Qblmt",5)==0) isQtable=1;  else if(strncmp(header,"blmt",4)==0) isQtable=0;  else error("loadbin: LM file is not in binary format");	  configure(maxlev,isQtable);    for (int l=1;l<=maxlev;l++){    inp >> cursize[l]; maxsize[l]=cursize[l];     }      if (isQtable){    char header2[100];    inp >> header2;    for (int i=1;i<=maxlev;i++){      inp >> NumCenters[i];      cerr << "reading  " << NumCenters[i] << " centers\n";    }  } }//load codebook of level 
lvoid lmtable::loadbincodebook(istream& inp,int l){    Pcenters[l]=new float [NumCenters[l]];     inp.read((char*)Pcenters[l],NumCenters[l] * sizeof(float));  if (l<maxlev){     Bcenters[l]=new float [NumCenters[l]];    inp.read((char *)Bcenters[l],NumCenters[l]*sizeof(float));  }  }  //load a binary lmfilevoid lmtable::loadbin(istream& inp, const char* header,const char* filename,int mmap){      cerr << "loadbin()\n";  loadbinheader(inp,header);  dict->load(inp);       //if MMAP is used, then open the file  if (filename && mmap>0){ #ifdef WIN32    error("lmtable::loadbin mmap facility not yet supported under WIN32\n");#else           if (mmap <= maxlev) memmap=mmap;    else error("keep_on_disk value is out of range\n");        if ((diskid=open(filename, O_RDONLY))<0){      std::cerr << "cannot open " << filename << "\n";      error("dying");    }        //check that the LM is uncompressed    char miniheader[4];    read(diskid,miniheader,4);    if (strncmp(miniheader,"Qblm",4) && strncmp(miniheader,"blmt",4))      error("mmap functionality does not work with compressed binary LMs\n");      #endif    }    for (int l=1;l<=maxlev;l++){    if (isQtable) loadbincodebook(inp,l);    if ((memmap == 0) || (l < memmap)){      cerr << "loading " << cursize[l] << " " << l << "-grams\n";      table[l]=new char[cursize[l] * nodesize(tbltype[l])];      inp.read(table[l],cursize[l] * nodesize(tbltype[l]));    }    else{      #ifdef WIN32      error("mmap not available under WIN32\n");#else      cerr << "mapping " << cursize[l] << " " << l << "-grams\n";      tableOffs[l]=inp.tellg();      table[l]=(char *)MMap(diskid,PROT_READ,                            tableOffs[l], cursize[l]*nodesize(tbltype[l]),                    &tableGaps[l]);      table[l]+=tableGaps[l];      inp.seekg(cursize[l]*nodesize(tbltype[l]),ios_base::cur);#endif          }  };      cerr << "done\n";  }int lmtable::get(ngram& ng,int n,int lev){    //  cout << "cerco:" << ng << "\n";  totget[lev]++;    if (lev > 
maxlev) error("get: lev exceeds maxlevel");  if (n < lev) error("get: ngram is too small");    //set boudaries for 1-gram   int offset=0,limit=cursize[1];	  //information of table entries  int hit;char* found; LMT_TYPE ndt;  ng.link=NULL;  ng.lev=0;              for (int l=1;l<=lev;l++){		    //initialize entry information     hit = 0 ; found = NULL; ndt=tbltype[l];        if (lmtcache[l] && lmtcache[l]->get(ng.wordp(n),(char *)&found))      hit=1;    else      search(l,             offset,             (limit-offset),             nodesize(ndt),             ng.wordp(n-l+1),              LMT_FIND,             &found);            //insert both found and not found items!!!    if (lmtcache[l] && hit==0)      lmtcache[l]->add(ng.wordp(n),(char *)&found);           if (!found) return 0;              ng.bow=(l<maxlev?bow(found,ndt):0);        ng.prob=prob(found,ndt);    ng.link=found;        ng.info=ndt;    ng.lev=l;        if (l<maxlev){ //set start/end point for next search			      //if current offset is at the bottom also that of successors will be      if (offset+1==cursize[l]) limit=cursize[l+1];      else limit=bound(found,ndt);            //if current start is at the begin, then also that of successors will be      if (found==table[l]) offset=0;      else offset=bound((found - nodesize(ndt)),ndt);            assert(offset!=-1); assert(limit!=-1);          }  }    //put information inside ng  ng.size=n;  ng.freq=0;   ng.succ=(lev<maxlev?limit-offset:0);  #ifdef TRACE_CACHE  if (ng.size==maxlev && sentence_id>0){    *cacheout << sentence_id << " miss " << ng << " " << (unsigned int) ng.link << "\n";    }#endif      return 1;}//recursively prints the language model tablevoid lmtable::dumplm(fstream& out,ngram ng, int ilev, int elev, int ipos,int epos){	  LMT_TYPE ndt=tbltype[ilev];  int ndsz=nodesize(ndt);	  assert(ng.size==ilev-1);  assert(ipos>=0 && epos<=cursize[ilev] && ipos<epos);  ng.pushc(0);		  for (int i=ipos;i<epos;i++){    
*ng.wordp(1)=word(table[ilev]+i*ndsz);    if (ilev<elev){      //get first and last successor position      int isucc=(i>0?bound(table[ilev]+(i-1)*ndsz,ndt):0);      int esucc=bound(table[ilev]+i*ndsz,ndt);      if (isucc < esucc) //there are successors!        dumplm(out,ng,ilev+1,elev,isucc,esucc);      //else      //cout << "no successors for " << ng << "\n";    }    else{      //out << i << " "; //this was just to count printed n-grams      int ipr=prob(table[ilev]+ i * ndsz,ndt);      out << (isQtable?ipr:*(float *)&ipr) <<"\t";      for (int k=ng.size;k>=1;k--){        if (k<ng.size) out << " ";        out << dict->decode(*ng.wordp(k));				      }                 if (ilev<maxlev){        int ibo=bow(table[ilev]+ i * ndsz,ndt);        if (isQtable) out << "\t" << ibo;        else          if (*((float *)&ibo)!=0.0)             out << "\t" << *((float *)&ibo);             }      out << "\n";				    }  }}//succscan iteratively returns all successors of an ngram h for which //get(h,h.size,h.size) returned true. int lmtable::succscan(ngram& h,ngram& ng,LMT_ACTION action,int lev){  assert(lev==h.lev+1 && h.size==lev && lev<=maxlev);    LMT_TYPE ndt=tbltype[h.lev];  int ndsz=nodesize(ndt);    switch (action){        case LMT_INIT:      //reset ngram local indexes            ng.size=lev;      ng.trans(h);          ng.midx[lev]=(h.link>table[h.lev]?bound(h.link-ndsz,ndt):0);            return 1;          case LMT_CONT:            if (ng.midx[lev]<bound(h.link,ndt))      {        //put current word into ng        *ng.wordp(1)=word(table[lev]+ng.midx[lev]*nodesize(tbltype[lev]));        ng.midx[lev]++;        return 1;      }      else        return 0;          default:      cerr << "succscan: only permitted options are LMT_INIT and LMT_CONT\n";      exit(0);  }  }//maxsuffptr returns the largest suffix of an n-gram that is contained //in the LM table. This can be used as a compact representation of the //(n-1)-gram state of a n-gram LM. 
if the input k-gram has k>=n then it //is trimmed to its n-1 suffix.const char *lmtable::maxsuffptr(ngram ong){      if (ong.size==0) return (char*) NULL;  if (ong.size>=maxlev) ong.size=maxlev-1;    ngram ng=ong;  //ngram ng(dict); //eventually use the <unk> word  //ng.trans(ong);    if (get(ng,ng.size,ng.size))    return ng.link;  else{     ong.size--;    return maxsuffptr(ong);  }}const char *lmtable::cmaxsuffptr(ngram ong){      if (ong.size==0) return (char*) NULL;  if (ong.size>=maxlev) ong.size=maxlev-1;    char* found;    if (statecache && (ong.size==maxlev-1) && statecache->get(ong.wordp(maxlev-1),(char *)&found))    return found;    found=(char *)maxsuffptr(ong);    if (statecache && ong.size==maxlev-1){    //if (statecache->isfull()) statecache->reset();    statecache->add(ong.wordp(maxlev-1),(char *)&found);      };     return found;  }//return log10 probsdouble lmtable::lprob(ngram ong){	  if (ong.size==0) return 0.0;  if (ong.size>maxlev) ong.size=maxlev;    ngram ng=ong;  //ngram ng(dict); //avoid dictionary transfer  //ng.trans(ong);	  double rbow;  int ibow,iprob;  	  if (get(ng,ng.size,ng.size)){    iprob=ng.prob;		    return (double)(isQtable?Pcenters[ng.size][iprob]:*((float *)&iprob));  }  else{ //size==1 means an OOV word     if (ng.size==1)          return -log(UNIGRAM_RESOLUTION)/log(10.0);    else{ // compute backoff          //set backoff state, shift n-gram, set default bow prob       bo_state(1); ng.shift();rbow=0.0; 			      if (ng.lev==ng.size){         ibow=ng.bow;         rbow= (double) (isQtable?Bcenters[ng.size][ibow]:*((float *)&ibow));      }      //prepare recursion step      ong.size--;            return rbow + lprob(ong);    }  }}//return log10 probsL use cache memorydouble lmtable::clprob(ngram ong){	    if (ong.size==0) return 0.0;    if (ong.size>maxlev) ong.size=maxlev;  double logpr; #ifdef TRACE_CACHE  if (probcache && ong.size==maxlev && sentence_id>0){   *cacheout << sentence_id << " " << ong << "\n";    }#endif      
//cache hit  if (probcache && ong.size==maxlev && probcache->get(ong.wordp(maxlev),(char *)&logpr)){            return logpr;     }   //cache miss  logpr=lprob(ong);    if (probcache && ong.size==maxlev){     probcache->add(ong.wordp(maxlev),(char *)&logpr);      };     return logpr;};void lmtable::stat(int level){  int totmem=0,memory;  float mega=1024 * 1024;    cout.precision(2);    cout << "lmtable class statistics\n";    cout << "levels " << maxlev << "\n";  for (int l=1;l<=maxlev;l++){    memory=cursize[l] * nodesize(tbltype[l]);    cout << "lev " << l       << " entries "<< cursize[l]       << " used mem " << memory/mega << "Mb\n";    totmem+=memory;  }    cout << "total allocated mem " << totmem/mega << "Mb\n";    cout << "total number of get and binary search calls\n";  for (int l=1;l<=maxlev;l++){    cout << "level " << l << " get: " << totget[l] << " bsearch: " << totbsearch[l] << "\n";  }    if (level >1 ) dict->stat();  }
lmtable.cpp.svn-base - 源码说明

本页面展示了「解码器是基于短语的统计机器翻译系统的核心模块」中的 lmtable.cpp.svn-base 源码文件。该文件是 lmtable.cpp 的 Subversion 基准副本（.svn-base），采用 C++ 编程语言编写，共 929 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与解码器相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?