📄 hlm.c
字号:
break; if (i!=j || k==0) HError(8150,"ReadBoNGram: %dGram count missing (%s)",i,buf); switch (ngFmtCh) { case '=': ngBin[j] = FALSE; break; case '~': ngBin[j] = TRUE; break; default: HError (9999, "ReadARPALM: unknown ngram format type '%c'", ngFmtCh); } counts[j]=k; } if (ngBin[1]) HError (8113, "ReadARPALM: unigram must be stored as text"); nglm=CreateBoNGram(lm,counts[1],counts); for (i=1;i<=nglm->nsize;i++) { sprintf(syc,"\\%d-grams:",i); SyncStr(buf,syc); ReadNGrams(nglm,i,nglm->counts[i], ngBin[i]); } SyncStr(buf,"\\end\\"); CloseSource(&source); if (trace&T_TIO) { printf("\n NEntry==%d ",nglm->counts[0]); for(i=1;i<=nglm->nsize;i++) printf(" %d-Grams==%d",i,nglm->counts[i]); printf("\n\n"); fflush(stdout); }}/* WriteBoNGram: write out WSJ/DP format ngram */static void WriteBoNGram(LModel *lm,char *fn,int flags){ int i,k; FILE *file; NGramLM *nglm; Boolean isPipe; nglm = lm->data.ngram; file=FOpen(fn,LangModOFilter,&isPipe); fprintf(file,"\\data\\\n"); for (i=1;i<=nglm->nsize;i++) { fprintf(file,"ngram %d=%d\n",i,nglm->counts[i]); } for (i=1;i<=nglm->nsize;i++) { k = WriteNGrams(file,nglm,i,1.0/LN10); if (k!=nglm->counts[i]) HError(-8190,"WriteBoNGram: Counts disagree for %dgram (%d vs %d)", i, k, nglm->counts[i]); } fprintf(file,"\n\\end\\\n"); FClose(file,isPipe);}void ClearBoNGram(LModel *lm){ NGramLM *nglm = lm->data.ngram; int i; for(i=1;i<=nglm->vocSize;i++) if (nglm->wdlist[i]!=NULL) nglm->wdlist[i]->aux=0;}/* -------------- Matrix Bigram Handling Routines ----------- */MatBiLM *CreateMatBigram(LModel *lm,int nw){ MatBiLM *matbi; matbi = (MatBiLM *) New(lm->heap,sizeof(MatBiLM)); lm->data.matbi = matbi; matbi->heap = lm->heap; matbi->numWords = nw; matbi->wdlist = (LabId *) New(lm->heap,sizeof(LabId)*(nw+1)); matbi->bigMat = CreateMatrix(lm->heap,nw,nw); ZeroMatrix(matbi->bigMat); return(matbi);}/* ReadRow: read a row from bigram file f into v */int ReadRow(Vector v){ int i,j,N,cnt,c; float x; N = VectorSize(v); i=0; while(!source.wasNewline) { x = GetFloat(FALSE); c=GetCh(&source); if (c == '*') cnt=GetInt(); else { UnGetCh(c,&source); cnt=1; } SkipWhiteSpace(&source); for (j=0;j<cnt;j++) { i++; if (i<=N) v[i] = x; } } return(i);}/* ReadBigram: load a bigram from given file */static void ReadMatBigram(LModel *lm,char *fn){ Vector vec; char buf[132]; int P,p,j; float sum,x; LabId id; MatBiLM *matbi; if (trace&T_TIO) printf("\nMB "),fflush(stdout); if(InitSource(fn,&source,LangModFilter)<SUCCESS) HError(8110,"ReadMatBigram: Can't open file %s", fn); vec = CreateVector(&gcheap,MAX_LMID); ReadLMWord(buf);SkipWhiteSpace(&source); id=GetLabId(buf,TRUE); P = ReadRow(vec); if (P<=0 || P >MAX_LMID) HError(8151,"ReadMatBigram: First row invalid (%d entries)",P); matbi=CreateMatBigram(lm,P); matbi->wdlist[1] = id; for (p=1;p<=P;p++) matbi->bigMat[1][p]=vec[p]; id->aux=(Ptr) 1; Dispose(&gcheap,vec); for (sum=0.0, j=1; j<=P; j++) { x = matbi->bigMat[1][j]; if (x<0) HError(8151,"ReadMatBigram: In bigram, entry %d for %s is -ve (%e)", j,buf,x); sum += x; matbi->bigMat[1][j]=((x<MINLARG)?LZERO:log(x)); } if (sum < 0.99 || sum > 1.01) HError(-8151,"ReadMatBigram: Row %d of bigram %s adds up to %f",1,fn,sum); for (p=2; ReadLMWord(buf); p++) { if (trace&T_TIO) { if ((p%25)==0) printf(". "),fflush(stdout); if ((p%800)==0) printf("\n "),fflush(stdout); } if (p>P) HError(8150,"ReadMatBigram: More rows than columns in bigram %s",fn); id=GetLabId(buf,TRUE); if ((int)id->aux != 0) HError(8150,"ReadMatBigram: Duplicated name %s in bigram %s",buf,fn); id->aux = (Ptr) p; matbi->wdlist[p] = id; SkipWhiteSpace(&source); if (ReadRow(matbi->bigMat[p])!=P) HError(8150,"ReadMatBigram: Wrong number of items in row %d",p); for (sum=0.0, j=1; j<=P; j++) { x = matbi->bigMat[p][j]; if (x<0) HError(8151,"ReadMatBigram: In bigram, entry %d for %s is -ve (%e)", j,buf,x); sum += x; matbi->bigMat[p][j]=((x<MINLARG)?LZERO:log(x)); } if (sum < 0.99 || sum > 1.01) HError(-8151,"ReadMatBigram: Row %d of bigram %s adds up to %f",p,fn,sum); } if (P>p) HError(8150,"ReadMatBigram: More columns than rows in bigram %s",fn); if (trace&T_TIO) printf("\n"),fflush(stdout); CloseSource(&source);}/* WriteMatBigram: write out old HVite format bigram */static void WriteMatBigram(LModel *lm,char *fn,int flags){ const float epsilon = 0.000001; MatBiLM *matbi; FILE *file; Boolean isPipe; Vector v; double x,y; int i,j,rep; if (trace&T_TIO) printf("\nMB "),fflush(stdout); matbi = lm->data.matbi; file=FOpen(fn,LangModOFilter,&isPipe); for (i=1;i<=matbi->numWords;i++) { if (trace&T_TIO) { if ((i%25)==0) printf(". "),fflush(stdout); if ((i%800)==0) printf("\n "),fflush(stdout); } fprintf(file,"%-8s ",ReWriteString(matbi->wdlist[i]->name, NULL,ESCAPE_CHAR)); v=matbi->bigMat[i];rep=0;x=-1.0; for (j=1;j<=matbi->numWords;j++){ y = L2F(v[j]); if (fabs(y - x) <= epsilon) rep++; else { if (rep>0) { fprintf(file,"*%d",rep+1); rep=0; } x = y; if (x == 0.0) fprintf(file," 0"); else if (x == 1.0) fprintf(file," 1"); else fprintf(file," %e",x); } } if (rep>0) fprintf(file,"*%d",rep+1); fprintf(file,"\n"); } FClose(file,isPipe); if (trace&T_TIO) printf("\n"),fflush(stdout);}/*------------------------- User Interface --------------------*//* EXPORT GetLMProb: return probability of word wd_id following pr_id[] */float GetLMProb(LModel *lm, LabId prid[NSIZE], LabId wdid){ LabId cpid[NSIZE]; NEntry *ne; SEntry *se; lmId p, q, word, ndx[NSIZE]; LogFloat bowt,prob; int i, s; switch (lm->type) { case boNGram: word = (int)wdid->aux; if (word==0 || word>lm->data.ngram->vocSize) return(LZERO); for (s=-1,i=0;i<NSIZE;i++) if (prid[i]!=NULL) ndx[i]=(int)prid[i]->aux, cpid[i]=prid[i], s=i; else ndx[i]=0, cpid[i]=NULL; /* If no answer back-off to unigram */ if (s<0) { if (word!=0) return(lm->data.ngram->unigrams[word]); else return(log(1.0/lm->data.ngram->vocSize)); } cpid[s]=0; ne = GetNEntry(lm->data.ngram,ndx,FALSE); if (ne) { /* Replace with bsearch equivalent */ for (i=0, se=ne->se; i<ne->nse; i++,se++) if (se->word==word) return(se->prob); /* Ngram found */ bowt=ne->bowt; } else { bowt=0.0; } if (s==0) return(lm->data.ngram->unigrams[word]+bowt); /* Backoff to unigram */ else return(bowt+GetLMProb(lm,cpid,wdid)); /* else recurse */ break; case matBigram: p=(int) prid[0]->aux; q=(int) wdid->aux; return(lm->data.matbi->bigMat[p][q]); default: prob=LZERO; } return(prob);}/* EXPORT ReadLModel: Determine LM type and then read-in */LModel *ReadLModel(MemHeap *heap,char *fn){ LModel *lm; LMType type; char buf[MAXSTRLEN+1]; int i; lm=(LModel*)New(heap,sizeof(LModel)); lm->heap=heap; lm->name=CopyString(heap,fn); if(InitSource(fn,&source,LangModFilter)<SUCCESS) HError(8110,"ReadLModel: Can't open file %s", fn); type=boNGram;i=0; do { if (i++==1000) { type=matBigram; break; } GetInLine(buf); } while (strcmp(buf, "\\data\\")!=0); CloseSource(&source); lm->type=type; switch(type) { case boNGram: ReadBoNGram(lm,fn); break; case matBigram: ReadMatBigram(lm,fn); break; } return(lm);}/* EXPORT WriteLModel: Determine LM type and then write-out */void WriteLModel(LModel *lm,char *fn,int flags){ switch(lm->type) { case boNGram: WriteBoNGram(lm,fn,flags); break; case matBigram: WriteMatBigram(lm,fn,flags); break; }}void ClearLModel(LModel *lm){ switch(lm->type) { case boNGram: ClearBoNGram(lm); break; case matBigram: break; }}/*----------------------------------------------------------------------*/#ifndef NO_LAT_LM/* FindSEntry find SEntry for wordId in array using binary search*/static SEntry *FindSEntry (SEntry *se, lmId pronId, int l, int h){ /*#### here l,h,c must be signed */ int c; while (l <= h) { c = (l + h) / 2; if (se[c].word == pronId) return &se[c]; else if (se[c].word < pronId) l = c + 1; else h = c - 1; } return NULL;}/* LMTransProb_ngram return logprob of transition from src labelled word. Also return dest state. ngram case*/LogFloat LMTrans (LModel *lm, LMState src, LabId wdid, LMState *dest){ NGramLM *nglm; LogFloat lmprob; lmId hist[NSIZE] = {0}; /* initialise whole array to zero! */ int i, l; NEntry *ne; SEntry *se; lmId word; assert (lm->type == boNGram); nglm = lm->data.ngram; word = (int) wdid->aux; if (word==0 || word>lm->data.ngram->vocSize) { HError (-9999, "word %d not in LM wordlist", word); *dest = NULL; return (LZERO); } ne = src; if (!src) { /* unigram case */ lmprob = nglm->unigrams[word]; } else { /* lookup prob p(word | src) */ /* try to find pronid in SEntry array */ se = FindSEntry (ne->se, word, 0, ne->nse - 1); assert (!se || (se->word == word)); if (se) /* found */ lmprob = se->prob; else { /* not found */ lmprob = 0.0; l = 0; hist[NSIZE-1] = 0; for (i = 0; i < NSIZE-1; ++i) { hist[i] = ne->word[i]; if (hist[i] != 0) l = i; } /* l is now the index of the last (oldest) non zero element */ for ( ; l > 0; --l) { if (ne) lmprob += ne->bowt; hist[l] = 0; /* back-off: discard oldest word */ ne = GetNEntry (nglm, hist, FALSE); if (ne) { /* skip over non existing hists. fix for weird LMs */ /* try to find pronid in SEntry array */ se = FindSEntry (ne->se, word, 0, ne->nse - 1); assert (!se || (se->word == word)); if (se) { /* found it */ lmprob += se->prob; l = -1; break; } } } if (l == 0) { /* backed-off all the way to unigram */ assert (!se); lmprob += ne->bowt; lmprob += nglm->unigrams[word]; } } } /* now determine dest state */ if (src) { ne = (NEntry *) src; l = 0; hist[NSIZE-1] = 0; for (i = 1; i < NSIZE-1; ++i) { hist[i] = ne->word[i-1]; if (hist[i] != 0) l = i; } /* l is now the index of the last (oldest) non zero element */ } else { for (i = 1; i < NSIZE-1; ++i) hist[i] = 0; l = 1; } hist[0] = word; ne = (LMState) GetNEntry (nglm, hist, FALSE); for ( ; !ne && (l > 0); --l) { hist[l] = 0; /* back off */ ne = (LMState) GetNEntry (nglm, hist, FALSE); } /* if we left the loop because l=0, then ne is still NULL, which is what we want */ *dest = ne;#if 0 printf ("lmprob = %f dest %p\n", lmprob, *dest);#endif return (lmprob);}#endif/* ------------------------- End of HLM.c ------------------------- */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -