⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 lmtable.h.svn-base

📁 解码器是基于短语的统计机器翻译系统的核心模块
💻 SVN-BASE
字号:
/******************************************************************************IrstLM: IRST Language Model ToolkitCopyright (C) 2006 Marcello Federico, ITC-irst Trento, ItalyThis library is free software; you can redistribute it and/ormodify it under the terms of the GNU Lesser General PublicLicense as published by the Free Software Foundation; eitherversion 2.1 of the License, or (at your option) any later version.This library is distributed in the hope that it will be useful,but WITHOUT ANY WARRANTY; without even the implied warranty ofMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNULesser General Public License for more details.You should have received a copy of the GNU Lesser General PublicLicense along with this library; if not, write to the Free SoftwareFoundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA******************************************************************************/#ifndef MF_LMTABLE_H#define MF_LMTABLE_H#ifndef WIN32#include <sys/types.h>#include <sys/mman.h>#endif#include "util.h"#include "ngramcache.h"#include "dictionary.h"#include "n_gram.h"//#undef TRACE_CACHE#define LMTMAXLEV  20#define MAX_LINE  1024#ifndef  LMTCODESIZE#define  LMTCODESIZE  (int)3#endif#define SHORTSIZE (int)2#define PTRSIZE   (int)sizeof(char *)#define INTSIZE   (int)4#define CHARSIZE  (int)1#define PROBSIZE  (int)4 //use float  #define QPROBSIZE (int)1 #define BOUNDSIZE (int)4#define UNIGRAM_RESOLUTION 10000000.0typedef enum {INTERNAL,QINTERNAL,LEAF,QLEAF} LMT_TYPE;typedef char* node;typedef enum {LMT_FIND,    //!< search: find an entry  LMT_ENTER,   //!< search: enter an entry   LMT_INIT,    //!< scan: start scan  LMT_CONT     //!< scan: continue scan} LMT_ACTION;class lmtable{    char*      table[LMTMAXLEV+1]; //storage of all levels  LMT_TYPE tbltype[LMTMAXLEV+1]; //table type for each levels  int      cursize[LMTMAXLEV+1]; //current size of levels  int      maxsize[LMTMAXLEV+1]; //current size of levels  int*    startpos[LMTMAXLEV+1]; //support vector to store start positions	  int               maxlev; //max level of table  char           info[100]; //information put in the header    //statistics   int    totget[LMTMAXLEV+1];  int    totbsearch[LMTMAXLEV+1];    //probability quantization  bool      isQtable;    int       NumCenters[LMTMAXLEV+1];  float*    Pcenters[LMTMAXLEV+1];  float*    Bcenters[LMTMAXLEV+1];    int     lmt_oov_code;  int     lmt_oov_size;  int    backoff_state;     //improve access speed  ngramcache* lmtcache[LMTMAXLEV+1];	ngramcache* probcache;  ngramcache* statecache;  int max_cache_lev;  //memory map on disk  int memmap;  //level from which n-grams are accessed via mmap  int diskid;  off_t tableOffs[LMTMAXLEV+1];  off_t tableGaps[LMTMAXLEV+1];  public:    #ifdef TRACE_CACHE    std::fstream* cacheout;  int sentence_id;#endif    dictionary     *dict; // dictionary    lmtable();    ~lmtable(){    for (int i=2;i<=LMTMAXLEV;i++)            if (lmtcache[i]){      std::cerr << i <<"-gram cache: "; lmtcache[i]->stat();      delete lmtcache[i];     }        if (probcache){      std::cerr << "Prob Cache: "; probcache->stat();      delete probcache;#if TRACE_CACHE      cacheout->close();      delete cacheout;#endif          }     if (statecache){      std::cerr << "State Cache: "; statecache->stat();      delete statecache;    }             for (int l=1;l<=maxlev;l++){      if (table[l]){           if (memmap)            Munmap(table[l]-tableGaps[l],cursize[l]*nodesize(tbltype[l])+tableGaps[l],0);        else          delete [] table[l];                  }      if (isQtable){        if (Pcenters[l]) delete [] Pcenters[l];				if (l<maxlev)           if (Bcenters[l]) delete [] Bcenters[l];      }    }  }      void init_probcache(){    assert(probcache==NULL);    probcache=new ngramcache(maxlev,sizeof(double),400000);#ifdef TRACE_CACHE    cacheout=new std::fstream("/tmp/tracecache",std::ios::out);    sentence_id=0;#endif   }    void init_statecache(){    assert(statecache==NULL);    statecache=new ngramcache(maxlev-1,sizeof(char *),200000);  }    void init_lmtcaches(int uptolev){    max_cache_lev=uptolev;    for (int i=2;i<=max_cache_lev;i++){    assert(lmtcache[i]==NULL);    lmtcache[i]=new ngramcache(i,sizeof(char *),200000);    }  }    void check_cache_levels(){    if (probcache && probcache->isfull()) probcache->reset(probcache->cursize());    if (statecache && statecache->isfull()) statecache->reset(statecache->cursize());    for (int i=2;i<=max_cache_lev;i++)      if (lmtcache[i]->isfull()) lmtcache[i]->reset(lmtcache[i]->cursize());  };        void reset_caches(){      if (probcache) probcache->reset(400000);      if (statecache) statecache->reset(200000);      for (int i=2;i<=max_cache_lev;i++)        lmtcache[i]->reset(200000);    };            void reset_mmap(){#ifndef WIN32    if (memmap>0 and memmap<=maxlev)      for (int l=memmap;l<=maxlev;l++){        std::cerr << "resetting mmap at level:" << l << "\n";        Munmap(table[l]-tableGaps[l],cursize[l]*nodesize(tbltype[l])+tableGaps[l],0);        table[l]=(char *)MMap(diskid,PROT_READ,                              tableOffs[l], cursize[l]*nodesize(tbltype[l]),                               &tableGaps[l]);        table[l]+=tableGaps[l];      }#endif   }        bool is_probcache_active(){return probcache!=NULL;}  bool is_statecache_active(){return statecache!=NULL;}  bool are_lmtcaches_active(){return lmtcache[2]!=NULL;}    	void configure(int n,bool quantized){		maxlev=n;		if (n==1)			tbltype[1]=(quantized?QLEAF:LEAF);		else{			for (int i=1;i<n;i++) tbltype[i]=(quantized?QINTERNAL:INTERNAL);			tbltype[n]=(quantized?QLEAF:LEAF);    }	};	  int maxlevel(){return maxlev;};	bool isQuantized(){return isQtable;}      void savetxt(const char *filename);  void savebin(const char *filename);  void dumplm(std::fstream& out,ngram ng, int ilev, int elev, int ipos,int epos);    void load(std::istream& inp,const char* filename=NULL,int mmap=0);  void loadtxt(std::istream& inp,const char* header);  void loadbin(std::istream& inp,const char* header,const char* filename=NULL,int mmap=0);    void loadbinheader(std::istream& inp, const char* header);  void loadbincodebook(std::istream& inp,int l);    void filter(const char* lmfile){};  void filter2(const char* lmfile,int buffMb=512){    std::cerr << "function is no more available\n";    exit(0);  };    void loadcenters(std::istream& inp,int Order);	  double lprob(ngram ng);   double clprob(ngram ng);     void *search(int lev,int offs,int n,int sz,int *w,               LMT_ACTION action,char **found=(char **)NULL);    int mybsearch(char *ar, int n, int size, unsigned char *key, int *idx);       int add(ngram& ng,int prob,int bow);  void checkbounds(int level);    int get(ngram& ng){return get(ng,ng.size,ng.size);}  int get(ngram& ng,int n,int lev);    int succscan(ngram& h,ngram& ng,LMT_ACTION action,int lev);    const char *maxsuffptr(ngram ong);  const char *cmaxsuffptr(ngram ong);    inline int putmem(char* ptr,int value,int offs,int size){    assert(ptr!=NULL);    for (int i=0;i<size;i++)      ptr[offs+i]=(value >> (8 * i)) & 0xff;    return value;  };    inline int getmem(char* ptr,int* value,int offs,int size){    assert(ptr!=NULL);    *value=ptr[offs] & 0xff;    for (int i=1;i<size;i++)      *value= *value | ( ( ptr[offs+i] & 0xff ) << (8 *i));    return *value;  };      int bo_state(int value=-1){     return (value==-1?backoff_state:backoff_state=value);   };      int nodesize(LMT_TYPE ndt){    switch (ndt){      case INTERNAL:        return LMTCODESIZE + PROBSIZE + PROBSIZE + BOUNDSIZE;      case QINTERNAL:        return LMTCODESIZE + QPROBSIZE + QPROBSIZE + BOUNDSIZE;      case QLEAF:        return LMTCODESIZE + QPROBSIZE;            case LEAF:        return LMTCODESIZE + PROBSIZE;            default:        assert(0);        return 0;    }  }      inline int word(node nd,int value=-1)  {    int offset=0;        if (value==-1)      getmem(nd,&value,offset,LMTCODESIZE);    else      putmem(nd,value,offset,LMTCODESIZE);        return value;  };    inline int prob(node nd,LMT_TYPE ndt, int value=-1)  {    int offs=LMTCODESIZE;    int size=(ndt==QINTERNAL || ndt==QLEAF?QPROBSIZE:PROBSIZE);        if (value==-1)      getmem(nd,&value,offs,size);    else      putmem(nd,value,offs,size);        return value;  };      inline int bow(node nd,LMT_TYPE ndt, int value=-1)  {    assert(ndt==INTERNAL || ndt==QINTERNAL);    int size=(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);    int offs=LMTCODESIZE+size;        if (value==-1)      getmem(nd,&value,offs,size);    else      putmem(nd,value,offs,size);        return value;  };    inline int bound(node nd,LMT_TYPE ndt, int value=-1)  {    assert(ndt==INTERNAL || ndt==QINTERNAL);    int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);        if (value==-1)      getmem(nd,&value,offs,BOUNDSIZE);    else      putmem(nd,value,offs,BOUNDSIZE);        return value;  };    void stat(int lev=0);  };#endif

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -