📄 lmtable.cpp.svn-base
字号:
/******************************************************************************IrstLM: IRST Language Model ToolkitCopyright (C) 2006 Marcello Federico, ITC-irst Trento, ItalyThis library is free software; you can redistribute it and/ormodify it under the terms of the GNU Lesser General PublicLicense as published by the Free Software Foundation; eitherversion 2.1 of the License, or (at your option) any later version.This library is distributed in the hope that it will be useful,but WITHOUT ANY WARRANTY; without even the implied warranty ofMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNULesser General Public License for more details.You should have received a copy of the GNU Lesser General PublicLicense along with this library; if not, write to the Free SoftwareFoundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA******************************************************************************/#include <fcntl.h>#include <iostream>#include <fstream>#include <stdexcept>#include <cassert>#include "math.h"#include "mempool.h"#include "htable.h"#include "ngramcache.h"#include "dictionary.h"#include "n_gram.h"#include "lmtable.h"#include "util.h"#define DEBUG 0using namespace std;inline void error(char* message){ std::cerr << message << "\n"; throw std::runtime_error(message);}//instantiate an empty lm tablelmtable::lmtable(){ configure(1,false); dict=new dictionary((char *)NULL,1000000,(char*)NULL,(char*)NULL); memset(cursize, 0, sizeof(cursize)); memset(tbltype, 0, sizeof(tbltype)); memset(maxsize, 0, sizeof(maxsize)); memset(info, 0, sizeof(info)); memset(NumCenters, 0, sizeof(NumCenters)); max_cache_lev=0; for (int i=0;i<=LMTMAXLEV+1;i++) lmtcache[i]=NULL; probcache=NULL; statecache=NULL; memmap=0; //statistics for (int i=0;i<=LMTMAXLEV+1;i++) totget[i]=totbsearch[i]=0;}; //loadstd::istream& inp a lmtable from a lm filevoid lmtable::load(istream& inp,const char* filename,int keep_on_disk){#ifdef WIN32 if (keep_on_disk>0){ std::cerr << "lmtable::load memory mapping not yet available under WIN32\n"; keep_on_disk = 0; }#endif //give a look at the header to select loading method char header[1024]; inp >> header; cerr << header << "\n"; if (strncmp(header,"Qblmt",5)==0 || strncmp(header,"blmt",4)==0){ loadbin(inp,header,filename,keep_on_disk); } else{ if (keep_on_disk>0) std::cerr << "Memory Mapping not available for text LM\n"; loadtxt(inp,header); } dict->genoovcode(); cerr << "OOV code is " << dict->oovcode() << "\n"; }int parseWords(char *sentence, char **words, int max){ char *word; int i = 0; char *const wordSeparators = " \t\r\n"; for (word = strtok(sentence, wordSeparators); i < max && word != 0; i++, word = strtok(0, wordSeparators)) { words[i] = word; } if (i < max){words[i] = 0;} return i;}//Load a LM as a text file. LM could have been generated either with the //IRST LM toolkit or with the SRILM Toolkit. In the latter we are not //sure that n-grams are lexically ordered (according to the 1-grams).//However, we make the following assumption: //"all successors of any prefix are sorted and written in contiguous lines!"//This method also loads files processed with the quantization //tool: qlmint parseline(istream& inp, int Order,ngram& ng,float& prob,float& bow){ char* words[1+ LMTMAXLEV + 1 + 1]; int howmany; char line[MAX_LINE]; inp.getline(line,MAX_LINE); if (strlen(line)==MAX_LINE-1){ cerr << "parseline: input line exceed MAXLINE (" << MAX_LINE << ") chars " << line << "\n"; exit(1); } howmany = parseWords(line, words, Order + 3); assert(howmany == (Order+ 1) || howmany == (Order + 2)); //read words ng.size=0; for (int i=1;i<=Order;i++) ng.pushw(strcmp(words[i],"<unk>")?words[i]:ng.dict->OOV()); //read logprob/code and logbow/code assert(sscanf(words[0],"%f",&prob)); if (howmany==(Order+2)) assert(sscanf(words[Order+1],"%f",&bow)); else bow=0.0; //this is log10prob=0 for implicit backoff return 1;}void lmtable::loadcenters(istream& inp,int Order){ char line[MAX_LINE]; //first read the coodebook cerr << Order << " read code book "; inp >> NumCenters[Order]; Pcenters[Order]=new float[NumCenters[Order]]; Bcenters[Order]=(Order<maxlev?new float[NumCenters[Order]]:NULL); for (int c=0;c<NumCenters[Order];c++){ inp >> Pcenters[Order][c]; if (Order<maxlev) inp >> Bcenters[Order][c]; }; //empty the last line inp.getline((char*)line,MAX_LINE); }void lmtable::loadtxt(istream& inp,const char* header){ //open input stream and prepare an input string char line[MAX_LINE]; //prepare word dictionary //dict=(dictionary*) new dictionary(NULL,1000000,NULL,NULL); dict->incflag(1); //put here ngrams, log10 probabilities or their codes ngram ng(dict); float prob,bow;; //check the header to decide if the LM is quantized or not isQtable=(strncmp(header,"qARPA",5)==0?true:false); //we will configure the table later we we know the maxlev; bool yetconfigured=false; cerr << "loadtxt()\n"; // READ ARPA Header int Order,n; while (inp.getline(line,MAX_LINE)){ if (strlen(line)==MAX_LINE-1){ cerr << "lmtable::loadtxt: input line exceed MAXLINE (" << MAX_LINE << ") chars " << line << "\n"; exit(1); } bool backslash = (line[0] == '\\'); if (sscanf(line, "ngram %d=%d", &Order, &n) == 2) { maxsize[Order] = n; maxlev=Order; //upadte Order } if (backslash && sscanf(line, "\\%d-grams", &Order) == 1) { //at this point we are sure about the size of the LM if (!yetconfigured){ configure(maxlev,isQtable);yetconfigured=true; //allocate space for loading the table of this level for (int i=1;i<=maxlev;i++) table[i]= new char[maxsize[i] * nodesize(tbltype[i])]; } cerr << Order << "-grams: reading "; if (isQtable) loadcenters(inp,Order); //allocate support vector to manage badly ordered n-grams if (maxlev>1 && Order<maxlev) { startpos[Order]=new int[maxsize[Order]]; for (int c=0;c<maxsize[Order];c++) startpos[Order][c]=-1; } //prepare to read the n-grams entries cerr << maxsize[Order] << " entries\n"; //WE ASSUME A WELL STRUCTURED FILE!!! for (int c=0;c<maxsize[Order];c++){ if (parseline(inp,Order,ng,prob,bow)) add(ng, (int)(isQtable?prob:*((int *)&prob)), (int)(isQtable?bow:*((int *)&bow))); } // now we can fix table at level Order -1 if (maxlev>1 && Order>1) checkbounds(Order-1); } } dict->incflag(0); cerr << "done\n"; }//Checkbound with sorting of n-gram table on diskvoid lmtable::checkbounds(int level){ char* tbl=table[level]; char* succtbl=table[level+1]; LMT_TYPE ndt=tbltype[level], succndt=tbltype[level+1]; int ndsz=nodesize(ndt), succndsz=nodesize(succndt); //re-order table at level+1 on disk //generate random filename to avoid collisions ofstream out;string filePath; createtempfile(out,filePath,ios::out); int start,end,newstart; //re-order table at level l+1 newstart=0; for (int c=0;c<cursize[level];c++){ start=startpos[level][c]; end=bound(tbl+c*ndsz,ndt); //is start==-1 there are no successors for this entry and end==-2 if (end==-2) end=start; assert(start<=end); assert(newstart+(end-start)<=cursize[level+1]); assert(end<=cursize[level+1]); if (start<end) out.write((char*)(succtbl + start * succndsz),(end-start) * succndsz); bound(tbl+c*ndsz,ndt,newstart+(end-start)); newstart+=(end-start); } out.close(); fstream inp(filePath.c_str(),ios::in); inp.read(succtbl,cursize[level+1]*succndsz); inp.close(); removefile(filePath);}//Add method inserts n-grams in the table structure. It is ONLY used during //loading of LMs in text format. It searches for the prefix, then it adds the //suffix to the last level and updates the start-end positions. int lmtable::add(ngram& ng,int iprob,int ibow){ char *found; LMT_TYPE ndt; int ndsz; if (ng.size>1){ // find the prefix starting from the first level int start=0, end=cursize[1]; for (int l=1;l<ng.size;l++){ ndt=tbltype[l]; ndsz=nodesize(ndt); if (search(l,start,(end-start),ndsz, ng.wordp(ng.size-l+1),LMT_FIND, &found)){ //update start-end positions for next step if (l< (ng.size-1)){ //set start position if (found==table[l]) start=0; //first pos in table else start=bound(found - ndsz,ndt); //end of previous entry //set end position end=bound(found,ndt); } } else{ cerr << "warning: missing back-off for ngram " << ng << "\n"; return 0; } } // update book keeping information about level ng-size -1. // if this is the first successor update start position int position=(found-table[ng.size-1])/ndsz; if (startpos[ng.size-1][position]==-1) startpos[ng.size-1][position]=cursize[ng.size]; //always update ending position bound(found,ndt,cursize[ng.size]+1); //cout << "startpos: " << startpos[ng.size-1][position] //<< " endpos: " << bound(found,ndt) << "\n"; } // just add at the end of table[ng.size] assert(cursize[ng.size]< maxsize[ng.size]); // is there enough space? ndt=tbltype[ng.size];ndsz=nodesize(ndt); found=table[ng.size] + (cursize[ng.size] * ndsz); word(found,*ng.wordp(1)); prob(found,ndt,iprob); if (ng.size<maxlev){bow(found,ndt,ibow);bound(found,ndt,-2);} cursize[ng.size]++; return 1; }void *lmtable::search(int lev, int offs, int n, int sz, int *ngp, LMT_ACTION action, char **found){ //assume 1-grams is a 1-1 map of the vocabulary if (lev==1) return *found=(*ngp <n ? table[1] + *ngp * sz:NULL); //prepare table to be searched with mybserach char* tb; tb=table[lev]+(sz * offs); //prepare search pattern char w[LMTCODESIZE];putmem(w,ngp[0],0,LMTCODESIZE); int idx=0; // index returned by mybsearch *found=NULL; //initialize output variable totbsearch[lev]++; switch(action){ case LMT_FIND: if (!tb || !mybsearch(tb,n,sz,(unsigned char *)w,&idx)) return NULL; else return *found=tb + (idx * sz); default: error("lmtable::search: this option is available"); }; return NULL;}int lmtable::mybsearch(char *ar, int n, int size, unsigned char *key, int *idx){ register int low, high; register unsigned char *p; register int result; register int i; /* return idx with the first position equal or greater than key */ /* Warning("start bsearch \n"); */ low = 0;high = n; *idx=0; while (low < high) { *idx = (low + high) / 2; p = (unsigned char *) (ar + (*idx * size)); //comparison for (i=(LMTCODESIZE-1);i>=0;i--){ result=key[i]-p[i]; if (result) break; } if (result < 0) high = *idx; else if (result > 0) low = *idx + 1; else return 1; } *idx=low; return 0; }// saves a LM table in text formatvoid lmtable::savetxt(const char *filename){ fstream out(filename,ios::out); int l; out.precision(7); if (isQtable) out << "qARPA\n"; ngram ng(dict,0); cerr << "savetxt: " << filename << "\n"; out << "\n\\data\\\n"; for (l=1;l<=maxlev;l++){
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -