📄 lgbase.c
字号:
/* ----------------------------------------------------------- *//* *//* ___ *//* |_| | |_/ SPEECH *//* | | | | \ RECOGNITION *//* ========= SOFTWARE */ /* *//* *//* ----------------------------------------------------------- *//* developed at: *//* *//* Speech Vision and Robotics group *//* Cambridge University Engineering Department *//* http://svr-www.eng.cam.ac.uk/ *//* *//* main authors: Valtcho Valtchev, Steve Young, *//* Julian Odell, Gareth Moore *//* ----------------------------------------------------------- *//* Copyright: *//* *//* 1994-2002 Cambridge University *//* Engineering Department *//* *//* Use of this software is governed by a License Agreement *//* ** See the file License for the Conditions of Use ** *//* ** This banner notice must not be removed ** *//* *//* ----------------------------------------------------------- *//* File: LGBase: Gram File Database Routines *//* ----------------------------------------------------------- */char *lgbase_version = "!HVER!LGBase: 3.3 [CUED 28/04/05]";char *lgbase_vc_id = "$Id: LGBase.c,v 1.1.1.1 2005/05/12 10:52:18 jal58 Exp $";#include "HShell.h"#include "HMem.h"#include "HMath.h"#include "HWave.h"#include "HLabel.h"#include "LUtil.h"#include "LWMap.h"#include "LGBase.h"/* ------------------------ Trace Flags --------------------- */static int trace = 0;#define T_TOP 0001 /* top level tracing */#define T_SQU 0002 /* trace squashing */#define T_SRT 0004 /* trace NG Buffer sorting */#define T_ITR 0010 /* print NG input set tree */#define T_MOP 0020 /* print max parallel input streams */#define T_IST 0040 /* trace parallel input streaming */#define T_FOF 0100 /* print info on FoF i/o *//* --------------------- Global Variables ------------------- */static ConfParam *cParm[MAXGLOBS]; /* config parameters */static int nParm = 0;static int sqOffset; /* squash offset, this depends on byte */static Boolean checkOrder = FALSE; /* Check n-gram ordering */static Boolean natReadOrder = FALSE; /* Preserve natural read byte order */static Boolean natWriteOrder = FALSE; /* Preserve natural write byte order */extern Boolean vaxOrder; /* True if byteswapping needed to preserve SUNSO *//* --------------------- Initialisation --------------------- *//* EXPORT -> InitGBase: initialise the module for n-grams */void InitGBase(void){ int i; Boolean b; Register(lgbase_version,lgbase_vc_id); /* get config variables for this module */ nParm = GetConfig("LGBASE", TRUE, cParm, MAXGLOBS); if (nParm>0){ if (GetConfInt(cParm,nParm,"TRACE",&i)) trace = i; if (GetConfBool(cParm,nParm,"NATURALREADORDER",&b)) natReadOrder = b; if (GetConfBool(cParm,nParm,"NATURALWRITEORDER",&b)) natWriteOrder = b; if (GetConfBool(cParm,nParm,"CHECKORDER",&b)) checkOrder = b; } /* Set byte order */ sqOffset = sizeof(UInt) - SQUASH; if (trace&T_SQU) printf("Squash offset is %d\n",sqOffset);}/* SetNGInfo: init info struct for given N-gram */static NGInfo SetNGInfo(int N){ NGInfo i; i.N = N; i.ng_size = N*SQUASH + 1; i.ng_full = (N+1)*sizeof(UInt); return i;}/* ------------------- Squashing routines ---------------------- *//* EXPORT->NGramSquash: compress each ngram to SQUASH bytes */void NGramSquash(int N, NGram ng, Byte *comp){ int i; UInt b; Byte *e,*c; Boolean mustSwap = (vaxOrder && !natWriteOrder); for (c = comp,i=0; i<N; i++, c+=SQUASH) { b = ng[i]; e = (Byte *) &b; if (mustSwap) SwapInt32((int *)&b); memcpy(c,e+sqOffset,SQUASH); }}/* EXPORT -> NGramExpand: expand ngrams from SQUASH num of bytes */void NGramExpand(int N, Byte *comp, NGram ng){ int i; UInt b; Byte *e,*c; Boolean mustSwap = (vaxOrder && !natReadOrder); for (c=comp,i=0; i<N; i++,c+=SQUASH){ e = (Byte *) &b; memset(e,0x00,sizeof(UInt)); memcpy(e+sqOffset,c,SQUASH); if (mustSwap) SwapInt32((int *)&b); ng[i] = b; } ng[N] = 0;}/* EXPORT -> SameGrams: true if grams (ignoring counts) are equal */Boolean SameGrams(int N, NGram ng1, NGram ng2){ int i; for (i=0; i<N; i++) if (ng1[i] != ng2[i]) return FALSE; return TRUE;}/* ------------------- NGram File Input/Output --------------- *//* EXPORT->PrintNGram: print given N-gram */void PrintNGram(int N, NGram ng, WordMap *wm){ int i; LabId id; for (i=0; i<N; i++) { id = WordLMName(ng[i],wm); printf("%-12s",id->name); } printf(" : %d\n",ng[N]);}/* LoadHGram: read text N-gram from header */static void ReadHGram(char *name, LMFileHdr hdr, int N, LabId *ng, char *fn){ int i; char *s,sbuf[MAXSTRLEN]; if ((s=GetLMHdrStr(name,hdr,FALSE)) == NULL) HError(15350,"ReadHGram: No %s field in %s",name,fn); strcpy(sbuf,s); for (i=0; i<N; i++){ s = strtok((i==0)?sbuf:NULL," \t\r\n"); if (s==NULL) HError(15350,"ReadHGram: Missing Sep in %s in %s",name,fn); ng[i] = GetLabId(s,TRUE); }}/* WriteHGram: write text N-gram to header */static void WriteTxtHGram(FILE *f, char *name, int N, LabId *ng){ int i; fprintf(f,"%s =",name); for (i=0; i<N; i++) { fprintf(f," %s",ng[i]->name); } fprintf(f,"\n");}/* WriteHGram: write a header for given NG Buffer */void WriteRawHGram(FILE *f, char *name, int N, NGram ng, WordMap *wm){ int i; LabId id; fprintf(f,"%s =",name); for (i=0; i<N; i++) { id = WordLMName(ng[i],wm); fprintf(f," %s",id->name); } fprintf(f,"\n");}/* SameHGrams: compare raw and text N-grams */static Boolean SameHGrams(int N, NGram ng, LabId *tg){ int i,ndx; for (i=0; i<N; i++) { if ((ndx = WordLMIndex(tg[i]))!=-1 && ndx!=ng[i]) return FALSE; } return TRUE;}/* CmpTxtNGram: compare two N-grams in text */static int CmpTxtNGram(int N, LabId *ng1, LabId *ng2){ int i,cmp; for (i=0; i<N; i++) { cmp = strcmp(ng1[i]->name,ng2[i]->name); if (cmp != 0) return cmp; } return 0;}/* CompareMapNames: compare map name and n-gram file map name */static Boolean CompareMapNames(char *ngfMap, char *master){ char *s; if (ngfMap==NULL || master==NULL) return FALSE; if ((s=strstr(master,ngfMap))==NULL) return FALSE; if (s!=master && *(s-1)!='%') /* not at the beginning and not preceeded by % */ return FALSE; s += strlen(ngfMap); if (*s!='\0' && *s!='%') /* not at the end and not followed by % */ return FALSE; return TRUE;}/* SetNext: initialise ngs->nxt array with the first N-gram with all words in the map. */static void SetNext(NGSource *ngs, Byte ngRawBuf[GSIZE]){ UInt *gp; int i, N, ng_size; Boolean same, hasOOM; N = ngs->info.N; ng_size = ngs->info.ng_size; while(ngs->nItems > 0) { memcpy(ngs->buf,ngRawBuf,ng_size); NGramExpand(N,ngs->buf,ngs->nxt); hasOOM = FALSE; for (gp=ngs->nxt,i=0; i<N; i++,gp++) { if (GetMEIndex(ngs->wm,*gp) < 0) { hasOOM = TRUE; break; } } if (hasOOM) { /* skip remaining N-grams, same as ngs->buf */ ngs->nItems--; do { if (fread(ngRawBuf,ng_size,1,ngs->src.f)==1) { same = memcmp(ngs->buf,ngRawBuf,ng_size-1) == 0; } else { same = FALSE; } } while(same); } else { break; } }}/* EXPORT->OpenNGramFile: open an ngram file and init NGSource */void OpenNGramFile(NGSource *ngs, char *fn, WordMap *wm){ LMFileHdr hdr; MemHeap mem; int i,n,N; char *s,buf[MAXSTRLEN]; Byte ngRawBuf[GSIZE]; UInt ngExpBuf[GSIZE]; /* Create and Load Header */ CreateHeap(&mem,"NGheader",MSTAK,1,0.0,1000,1000); if (InitSource(fn, &(ngs->src), LGramFilter) == FAIL) HError(15311,"OpenNGramFile: Can't open gram file '%s'", fn); if (ReadLMHeader(&mem, &(ngs->src), LGramFilter, &hdr, &n) != GRAM_HDR) HError(15350,"OpenNGramFile: Bad header in file %s",fn); ngs->nItems = n; /* Check Word map name and seqno */ if ((s=GetLMHdrStr("WMAP",hdr,FALSE)) == NULL) HError(15350,"OpenNGramFile: No WMap field in %s",fn); if (!CompareMapNames(s,wm->name)) HError(15330,"OpenNGramFile: Gram file map %s inconsistent with %s", s,wm->name); if (!GetLMHdrInt("SEQNO",&n,hdr)) HError(15350,"OpenNGramFile: No SeqNo field in %s",fn); if (n > wm->seqno) HError(15330,"OpenNGramFile: SeqNo of map file is too low [%d vs %d]", n,wm->seqno); /* Check map matches WMCHECK */ if ((s=GetLMHdrStr("WMCHECK",hdr,FALSE)) == NULL) HError(15350,"OpenNGramFile: No WMCheck field in %s",fn); strcpy(buf,s); if ((s=strchr(buf,' ')) == NULL) HError(15350,"OpenNGramFile: Missing Sep in WMCheck in %s",fn); *s = '\0'; n = atoi(s+1); if ((i=WordLMIndex(GetLabId(buf,FALSE)))!=-1 && i!=n) HError(15330,"OpenNGramFile: WMCheck FAILURE in %s, %d vs %d",fn,i,n); /* Ok, So Get Rest of Header Info */ if (!GetLMHdrInt("NGRAM",&N,hdr)) HError(15350,"OpenNGramFile: No Ngram field in %s",fn); ngs->info = SetNGInfo(N); s = GetLMHdrStr("SOURCE",hdr,FALSE); if (s==NULL) ngs->txtsrc[0] = '\0'; else strcpy(ngs->txtsrc,s); ReadHGram("GRAM1",hdr,N,ngs->firstGram,fn); ReadHGram("GRAMN",hdr,N,ngs->lastGram,fn); ngs->wm = wm; if (trace&T_TOP) { printf("Read Header for %s, [%d grams, size %d]\n",fn,ngs->nItems,N); fflush(stdout); } /* initialise the source by reading the first gram */ if (fread(ngRawBuf,ngs->info.ng_size,1,ngs->src.f) !=1 ) HError(15350, "OpenNGramFile: Empty file %s\n", fn); NGramExpand(N,ngRawBuf,ngExpBuf); if (!SameHGrams(N,ngExpBuf,ngs->firstGram)) { WriteTxtHGram(stdout,"Gram1",N,ngs->firstGram); WriteRawHGram(stdout,"gram1",N,ngExpBuf,wm); HError(15330, "OpenNGramFile: Header-specified 1st gram is not equal to the actual 1st gram in file %s\n", fn); } SetNext(ngs,ngRawBuf); /* This could well exhaust the file and reduce nItems to 0 */ DeleteHeap(&mem);}/* EXPORT->CloseNGramFile: close given ngram file source */void CloseNGramFile(NGSource *ngs){ CloseSource(&(ngs->src));}/* EXPORT->ReadNGram: read the next ngram from given source. (The next ngram to read will already be in its buffer) */void ReadNGram(NGSource *ngs, NGram ng){ UInt a,oc,N,ng_size; Byte c,b[GSIZE]; Boolean same; if (ngs->nItems <= 0) HError(15313,"ReadNGram: Gram file %s is empty",ngs->src.name); ngs->nItems--; oc = 0; a = 1; N = ngs->info.N; ng_size = ngs->info.ng_size; c = ngs->buf[ng_size-1]; do { oc += a*c; a *= 256; if (fread(b, ng_size, 1, ngs->src.f)==1) { same = memcmp(ngs->buf, b, ng_size-1) == 0; c = b[ng_size-1]; } else { same = FALSE; } } while (same); NGramExpand(N,ngs->buf,ng); ng[N] = oc; SetNext(ngs,b);}/* EXPORT -> WriteNGram: write compressed nGram to file f */int WriteNGram(FILE *f, int N, NGram ng){ Byte b; UInt a,c,bsize,count; static Byte buf[GSIZE]; NGramSquash(N, ng,buf); bsize = N*SQUASH;#ifdef LM_FLOAT_COUNT count = (UInt) *((float *)(ng + N))#else count = ng[N];#endif for (a=count,c=0; a != 0; a = a / 256, c++) { b = a % 256; fwrite(buf, bsize, 1, f); fwrite(&b, sizeof(Byte), 1, f); } return c;}/* --------------------- NGram Buffer Handling --------------- *//* EXPORT->CreateNGBuffer: Create an N-gram buffer with size slots */NGBuffer *CreateNGBuffer(MemHeap *mem, int N, int size, char *fn, WordMap *wm){ NGBuffer *ngb; UInt poolbytes; ngb = (NGBuffer *)New(mem,sizeof(NGBuffer)); ngb->info = SetNGInfo(N); ngb->poolsize = size; ngb->wm = wm; ngb->used = 0; ngb->fn = CopyString(mem,fn); ngb->fndx = 0; poolbytes = ngb->info.ng_full*size; ngb->next = ngb->pool = (UInt *) New(mem,poolbytes); return ngb;}/* EXPORT->StoreNGram: store ngram in buf into ngb, return TRUE if ngb is full */Boolean StoreNGram(NGBuffer *ngb, NGram ng){ memcpy(ngb->next, ng, ngb->info.ng_full); ngb->used++; ngb->next += ngb->info.N+1; return (ngb->used==ngb->poolsize);}/* CmpNGram: compare N-grams ng1 and ng2 using word map wm */static int CmpNGram(WordMap *wm, int N, UInt *ng1, UInt *ng2){ int i1,i2,j,s1,s2;#ifdef SANITY if (wm == NULL) HError(15390,"WordLMCmp: Word map is NULL"); if (!wm->isSorted) HError(15390,"WordLMCmp: Word map is not sorted");#endif for (j=0; j<N; j++) { if ((i1 = GetMEIndex(wm,ng1[j])) < 0) HError(15395,"WordLMCmp: Index %d not found in wordmap",ng1[j]); if ((i2 = GetMEIndex(wm,ng2[j])) < 0) HError(15395,"WordLMCmp: Index %d not found in wordmap",ng2[j]); s1 = wm->me[i1].sort; s2 = wm->me[i2].sort; if (s1 < s2) return -1; if (s1 > s2) return +1; } return 0;}static int qs_cmpSize; /* must set before using this routine */static WordMap *qs_wmap; /* word list to access mapentries */static NGInputSet *qs_inset; /* input set *//* qs_CmpNGram: compare two N-grams, used in qsort */static int qs_CmpNGram(const void *p1, const void *p2){ return CmpNGram(qs_wmap,qs_cmpSize,(UInt *)p1,(UInt *)p2);}/* qs_CmpGFile: compare two NGSources on nxt field */static int qs_CmpGFile(const void *p1, const void *p2){ NGram p,q; int *i1, *i2; i1 = (int *)p1; i2 = (int *)p2; p = qs_inset->ngs[*i1].nxt; q = qs_inset->ngs[*i2].nxt; return CmpNGram(qs_inset->wm,qs_inset->N,p,q);}/* EXPORT->SortNGBuffer: sort+uniqe N-grams in ngb */void SortNGBuffer(NGBuffer *ngb){ int i, count, isize, N;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -