📄 lmtable.cpp.svn-base

📁 解码器是基于短语的统计机器翻译系统的核心模块
💻 SVN-BASE
📖 第 1 页 / 共 2 页
字号:
12 下一页
/******************************************************************************IrstLM: IRST Language Model ToolkitCopyright (C) 2006 Marcello Federico, ITC-irst Trento, ItalyThis library is free software; you can redistribute it and/ormodify it under the terms of the GNU Lesser General PublicLicense as published by the Free Software Foundation; eitherversion 2.1 of the License, or (at your option) any later version.This library is distributed in the hope that it will be useful,but WITHOUT ANY WARRANTY; without even the implied warranty ofMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNULesser General Public License for more details.You should have received a copy of the GNU Lesser General PublicLicense along with this library; if not, write to the Free SoftwareFoundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA******************************************************************************/#include <fcntl.h>#include <iostream>#include <fstream>#include <stdexcept>#include <cassert>#include "math.h"#include "mempool.h"#include "htable.h"#include "ngramcache.h"#include "dictionary.h"#include "n_gram.h"#include "lmtable.h"#include "util.h"#define DEBUG 0using namespace std;inline void error(char* message){  std::cerr << message << "\n";  throw std::runtime_error(message);}//instantiate an empty lm tablelmtable::lmtable(){    configure(1,false);    dict=new dictionary((char *)NULL,1000000,(char*)NULL,(char*)NULL);    memset(cursize, 0, sizeof(cursize));	memset(tbltype, 0, sizeof(tbltype));	memset(maxsize, 0, sizeof(maxsize));	memset(info, 0, sizeof(info));	memset(NumCenters, 0, sizeof(NumCenters));   max_cache_lev=0;  for (int i=0;i<=LMTMAXLEV+1;i++) lmtcache[i]=NULL;    probcache=NULL;  statecache=NULL;    memmap=0;  //statistics  for (int i=0;i<=LMTMAXLEV+1;i++) totget[i]=totbsearch[i]=0;}; //loadstd::istream& inp a lmtable from a lm filevoid lmtable::load(istream& inp,const char* filename,int keep_on_disk){#ifdef WIN32  if (keep_on_disk>0){    std::cerr << "lmtable::load memory mapping not yet available under WIN32\n";		keep_on_disk = 0;  }#endif  //give a look at the header to select loading method  char header[1024];	  inp >> header; cerr << header << "\n";    if (strncmp(header,"Qblmt",5)==0 || strncmp(header,"blmt",4)==0){            loadbin(inp,header,filename,keep_on_disk);  }  else{     if (keep_on_disk>0)       std::cerr << "Memory Mapping not available for text LM\n";    loadtxt(inp,header);  }  dict->genoovcode();    	  cerr << "OOV code is " << dict->oovcode() << "\n";  }int parseWords(char *sentence, char **words, int max){  char *word;  int i = 0;    char *const wordSeparators = " \t\r\n";    for (word = strtok(sentence, wordSeparators);       i < max && word != 0;       i++, word = strtok(0, wordSeparators))  {    words[i] = word;  }		  if (i < max){words[i] = 0;}    return i;}//Load a LM as a text file. LM could have been generated either with the //IRST LM toolkit or with the SRILM Toolkit. In the latter we are not //sure that n-grams are lexically ordered (according to the 1-grams).//However, we make the following assumption: //"all successors of any prefix are sorted and written in contiguous lines!"//This method also loads files processed with the quantization //tool: qlmint parseline(istream& inp, int Order,ngram& ng,float& prob,float& bow){	  char* words[1+ LMTMAXLEV + 1 + 1];  int howmany;		  char line[MAX_LINE];    inp.getline(line,MAX_LINE);    if (strlen(line)==MAX_LINE-1){      cerr << "parseline: input line exceed MAXLINE ("       << MAX_LINE << ") chars " << line << "\n";    exit(1);  }    howmany = parseWords(line, words, Order + 3);  assert(howmany == (Order+ 1) || howmany == (Order + 2));	  //read words  ng.size=0;  for (int i=1;i<=Order;i++)     ng.pushw(strcmp(words[i],"<unk>")?words[i]:ng.dict->OOV());    //read logprob/code and logbow/code  assert(sscanf(words[0],"%f",&prob));  if (howmany==(Order+2))    assert(sscanf(words[Order+1],"%f",&bow));  else    bow=0.0; //this is log10prob=0 for implicit backoff		    return 1;}void lmtable::loadcenters(istream& inp,int Order){  char line[MAX_LINE];    //first read the coodebook  cerr << Order << " read code book ";  inp >> NumCenters[Order];  Pcenters[Order]=new float[NumCenters[Order]];  Bcenters[Order]=(Order<maxlev?new float[NumCenters[Order]]:NULL);	  for (int c=0;c<NumCenters[Order];c++){    inp >> Pcenters[Order][c];    if (Order<maxlev) inp >> Bcenters[Order][c];  };    //empty the last line    inp.getline((char*)line,MAX_LINE);  }void lmtable::loadtxt(istream& inp,const char* header){      //open input stream and prepare an input string  char line[MAX_LINE];    //prepare word dictionary  //dict=(dictionary*) new dictionary(NULL,1000000,NULL,NULL);   dict->incflag(1);	  //put here ngrams, log10 probabilities or their codes  ngram ng(dict);   float prob,bow;;    //check the header to decide if the LM is quantized or not  isQtable=(strncmp(header,"qARPA",5)==0?true:false);	  //we will configure the table later we we know the maxlev;  bool yetconfigured=false;	  cerr << "loadtxt()\n"; 	  // READ ARPA Header  int Order,n;    while (inp.getline(line,MAX_LINE)){		    if (strlen(line)==MAX_LINE-1){      cerr << "lmtable::loadtxt: input line exceed MAXLINE ("       << MAX_LINE << ") chars " << line << "\n";      exit(1);    }        bool backslash = (line[0] == '\\');        if (sscanf(line, "ngram %d=%d", &Order, &n) == 2) {      maxsize[Order] = n; maxlev=Order; //upadte Order          }		    if (backslash && sscanf(line, "\\%d-grams", &Order) == 1) {			      //at this point we are sure about the size of the LM      if (!yetconfigured){        configure(maxlev,isQtable);yetconfigured=true;        //allocate space for loading the table of this level        for (int i=1;i<=maxlev;i++)          table[i]= new char[maxsize[i] * nodesize(tbltype[i])];                          }			             cerr << Order << "-grams: reading ";			      if (isQtable) loadcenters(inp,Order);									      //allocate support vector to manage badly ordered n-grams      if (maxlev>1 && Order<maxlev) {        startpos[Order]=new int[maxsize[Order]];        for (int c=0;c<maxsize[Order];c++) startpos[Order][c]=-1;      }			      //prepare to read the n-grams entries      cerr << maxsize[Order] << " entries\n";            //WE ASSUME A WELL STRUCTURED FILE!!!            for (int c=0;c<maxsize[Order];c++){				        if (parseline(inp,Order,ng,prob,bow))          add(ng,              (int)(isQtable?prob:*((int *)&prob)),              (int)(isQtable?bow:*((int *)&bow)));      }      // now we can fix table at level Order -1      if (maxlev>1 && Order>1) checkbounds(Order-1);			    }  }	  dict->incflag(0);    cerr << "done\n";	}//Checkbound with sorting of n-gram table on diskvoid lmtable::checkbounds(int level){    char*  tbl=table[level];  char*  succtbl=table[level+1];	  LMT_TYPE ndt=tbltype[level], succndt=tbltype[level+1];  int ndsz=nodesize(ndt), succndsz=nodesize(succndt);	   //re-order table at level+1 on disk  //generate random filename to avoid collisions   ofstream out;string filePath;  createtempfile(out,filePath,ios::out);   int start,end,newstart;	  //re-order table at level l+1  newstart=0;  for (int c=0;c<cursize[level];c++){    start=startpos[level][c]; end=bound(tbl+c*ndsz,ndt);    //is start==-1 there are no successors for this entry and end==-2    if (end==-2) end=start;    assert(start<=end);    assert(newstart+(end-start)<=cursize[level+1]);    assert(end<=cursize[level+1]);		    if (start<end)      out.write((char*)(succtbl + start * succndsz),(end-start) * succndsz);          bound(tbl+c*ndsz,ndt,newstart+(end-start));    newstart+=(end-start);  }  out.close();	fstream inp(filePath.c_str(),ios::in);    inp.read(succtbl,cursize[level+1]*succndsz);  inp.close();    removefile(filePath);}//Add method inserts n-grams in the table structure. It is ONLY used during //loading of LMs in text format. It searches for the prefix, then it adds the //suffix to the last level and updates the start-end positions. int lmtable::add(ngram& ng,int iprob,int ibow){	  char *found; LMT_TYPE ndt; int ndsz;      if (ng.size>1){		    // find the prefix starting from the first level    int start=0, end=cursize[1]; 		    for (int l=1;l<ng.size;l++){			      ndt=tbltype[l]; ndsz=nodesize(ndt);			      if (search(l,start,(end-start),ndsz,                 ng.wordp(ng.size-l+1),LMT_FIND, &found)){				        //update start-end positions for next step        if (l< (ng.size-1)){										          //set start position          if (found==table[l]) start=0; //first pos in table          else start=bound(found - ndsz,ndt); //end of previous entry 				            //set end position          end=bound(found,ndt);        }      }      else{        cerr << "warning: missing back-off for ngram " << ng << "\n";        return 0;      }		    }		    // update book keeping information about level ng-size -1.    // if this is the first successor update start position    int position=(found-table[ng.size-1])/ndsz;    if (startpos[ng.size-1][position]==-1)      startpos[ng.size-1][position]=cursize[ng.size];		    //always update ending position	    bound(found,ndt,cursize[ng.size]+1);    //cout << "startpos: " << startpos[ng.size-1][position]     //<< " endpos: " << bound(found,ndt) << "\n";		  }	  // just add at the end of table[ng.size]	  assert(cursize[ng.size]< maxsize[ng.size]); // is there enough space?  ndt=tbltype[ng.size];ndsz=nodesize(ndt);    found=table[ng.size] + (cursize[ng.size] * ndsz);  word(found,*ng.wordp(1));   prob(found,ndt,iprob);  if (ng.size<maxlev){bow(found,ndt,ibow);bound(found,ndt,-2);}	  cursize[ng.size]++;	  return 1;	}void *lmtable::search(int lev,                      int offs,                      int n,                      int sz,                      int *ngp,                      LMT_ACTION action,                      char **found){	  //assume 1-grams is a 1-1 map of the vocabulary  if (lev==1) return *found=(*ngp <n ? table[1] + *ngp * sz:NULL);  //prepare table to be searched with mybserach    char* tb;  tb=table[lev]+(sz * offs);    //prepare search pattern  char w[LMTCODESIZE];putmem(w,ngp[0],0,LMTCODESIZE);	  int idx=0; // index returned by mybsearch  *found=NULL;	//initialize output variable	    totbsearch[lev]++;    switch(action){        case LMT_FIND:			      if (!tb || !mybsearch(tb,n,sz,(unsigned char *)w,&idx)) return NULL;      else        return *found=tb + (idx * sz);    default:      error("lmtable::search: this option is available");  };	  return NULL;}int lmtable::mybsearch(char *ar, int n, int size,                       unsigned char *key, int *idx){    register int low, high;  register unsigned char *p;  register int result;  register int i;    /* return idx with the first position equal or greater than key */    /*   Warning("start bsearch \n"); */    low = 0;high = n; *idx=0;  while (low < high)  {    *idx = (low + high) / 2;    p = (unsigned char *) (ar + (*idx * size));        //comparison    for (i=(LMTCODESIZE-1);i>=0;i--){      result=key[i]-p[i];      if (result) break;    }        if (result < 0)      high = *idx;    else if (result > 0)      low = *idx + 1;    else      return 1;  }    *idx=low;    return 0;  }// saves a LM table in text formatvoid lmtable::savetxt(const char *filename){    fstream out(filename,ios::out);  int l;	  out.precision(7);			    if (isQtable) out << "qARPA\n";	      ngram ng(dict,0);    cerr << "savetxt: " << filename << "\n";    out << "\n\\data\\\n";  for (l=1;l<=maxlev;l++){
12 下一页
💿 文件大小 5827 K
👤 上传用户 lyyfengyutongzh
📂 所属分类多国语言处理
🏷️ 相关标签

#解码器 #机器翻译系统 #核心 #模块
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -