⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 lmodel.c

📁 隐马尔科夫模型工具箱
💻 C
📖 第 1 页 / 共 5 页
字号:
   ptype = lm->probType;   if (FRead(&cneCnt,sizeof(int32),1,src)!=1)      HError(15450,"LoadUltraNGrams: Unable to read CNEntry count");   if (mustSwap) SwapInt32(&cneCnt);   if (FRead(&seCnt,sizeof(int32),1,src)!=1)      HError(15450,"LoadUltraNGrams: Unable to read SEntry count");   if (mustSwap) SwapInt32(&seCnt);   if (trace&T_LOAD)      printf("Loading %d NEntry(s)\n",cneCnt);   cneBuf = (CNEntry *) New(&gstack,cneCnt*sizeof(NEntry));   if (FRead(cneBuf,sizeof(CNEntry),cneCnt,src)!=cneCnt)      HError(15450,"LoadUltraNGrams: Unable to read CNEntry array");   if (mustSwap) {      for (cne=cneBuf,i=0; i<cneCnt; i++,cne++) SWAP_CE(cne);   }   if (trace&T_LOAD)      printf("Loading %d SEntry(s)\n",seCnt);   /* create and read SMEntry block */   smeBuf = (SMEntry *) New(lm->heap,seCnt*sizeof(SMEntry));   for (sme=smeBuf,i=0; i<seCnt; i++,sme++) {      if (FRead(&se,sizeof(SEntry),1,src)!=1)	 HError(15450,"LoadUltraNGrams: Unable to read SEntry (%d)",i);      if (mustSwap) SWAP_SE((&se));#ifdef LM_COMPACT      sme->prob = se.prob;#else      prob = PROB_SHORT_TO_LOG(se.prob);      sme->prob = (ptype==LMP_FLOAT) ? LOG10_TO_FLT(prob) : prob * scale;#endif      sme->ndx = se.word;   }   /* create table of pointers to SMEntry arrays */   smeTab = (SMEntry **) New(&gstack,cneCnt*sizeof(SMEntry *));   for (sme=smeBuf,j=0; j<cneCnt; j++) {     smeTab[j] = (cneBuf[j].nse==0) ? NULL : (SMEntry *) sme;     sme += cneBuf[j].nse;   }   /* create and sort lookup index array */   cneTab = (int *) New(&gstack,sizeof(int)*cneCnt);   for (i=0; i<cneCnt; i++) cneTab[i] = i;   qs_cneBuf = cneBuf;   qsort(cneTab,cneCnt,sizeof(int),nep_cmp);   feBuf = (FLEntry *) New(lm->heap,cneCnt*sizeof(FLEntry));   parent = &lm->root;   CNE2FE(cneTab[0],parent);   parent->nfe = 0;   parent->fea = cfe = feBuf;   parent->parent = 0;   for (i=0; i<NSIZE-1; i++) context[i] = cneBuf[cneTab[1]].word[i];   for (i=1; i<cneCnt; i++) {      cne = cneBuf+cneTab[i];      for (newCTX=FALSE,j=1; j<NSIZE-1; j++) {	 if (context[j]!=cne->word[j]) {	    newCTX=TRUE; break;	 }      }      if (newCTX) {	 for (parent=&lm->root,j=NSIZE-2; j>0; j--) {	    if ((idx=cne->word[j])==0) continue;	    if ((parent = FindFE(parent->fea,0,parent->nfe,idx))==NULL) {	       HError(15450,"LoadUltraNGrams: Items not in order %d",i);	    }	 }	 parent->fea = cfe; parent->nfe = 0;	 for (j=0; j<NSIZE-1; j++) context[j] = cne->word[j];      }      parent->nfe++; CNE2FE(cneTab[i],cfe); cfe++;   }   Dispose(&gstack,cneBuf);}static int WriteNEntry(FILE *f, BackOffLM *lm, int lev, FLEntry **feBuf,		       FLEntry **feTab, int *fetCount){   int i,total;   CNEntry ne;   float scale,bowt;   LMProbType ptype;   FLEntry *fe,*tgtFE;   Boolean mustSwap = (vaxOrder && !natWriteOrder);   if (lev==lm->nSize)      return 0;   ptype = lm->probType;   scale = 1.0/(lm->gScale*LN10);   tgtFE = feBuf[lev-1];   total = 0;   INIT_CNE(ne);   for (i=1; i<lev; i++) ne.word[lev-i] = feBuf[i]->ndx;   for (fe = tgtFE->fea, i=0; i<tgtFE->nfe; i++, fe++) {      if (fe->nse==0)	 continue;      feTab[(*fetCount)++] = fe;      ne.word[0] = fe->ndx;      ne.nse = fe->nse;      bowt = (ptype==LMP_FLOAT) ? FLT_TO_LOG10(fe->bowt) : fe->bowt*scale;  /* convert to LOG10 */      ne.bowt = BOWT_LOG_TO_SHORT(bowt);   /* compress LOG10 to short */      if (mustSwap) {	 SWAP_CE((&ne));	 fwrite(&ne,sizeof(CNEntry),1,f);	 SWAP_CE((&ne));      } else {	 fwrite(&ne,sizeof(CNEntry),1,f);      }      total++;   }   if (++lev < lm->nSize) {      for (fe = tgtFE->fea, i=0; i<tgtFE->nfe; i++, fe++) {	 feBuf[lev-1] = fe;	 total += WriteNEntry(f,lm,lev,feBuf,feTab,fetCount);      }   }   return total;}static int WriteSEntry(FILE *f,BackOffLM *lm,FLEntry **feTab, int fetCount){   SEntry se;   SMEntry *sme;   FLEntry *fe;   int i,j,total = 0;   float scale,prob;   LMProbType ptype;   Boolean mustSwap = (vaxOrder && !natWriteOrder);   ptype = lm->probType;   scale = 1.0/(lm->gScale*LN10);   total = 0;   for (i=0; i<fetCount; i++) {      fe = feTab[i];      for (sme=fe->sea,j=0; j<fe->nse; j++,sme++) {	 prob = (ptype==LMP_FLOAT) ? FLT_TO_LOG10(sme->prob) : sme->prob*scale;	 se.prob = PROB_LOG_TO_SHORT(prob);   /* LOG10 -> short */	 se.word = sme->ndx;	 if (mustSwap) {	    SWAP_SE((&se));	    fwrite(&se,sizeof(SEntry),1,f);	    SWAP_SE((&se));	 } else {	    fwrite(&se,sizeof(SEntry),1,f);	 }	 total++;      }   }   return total;}static void CountEntries(int lev, int nSize, FLEntry *tgtFE, int *nfe, int *nse){   int i;   FLEntry *fe;   *nse += tgtFE->nse;   if (lev < nSize)      *nfe += tgtFE->nfe;      for (fe = tgtFE->fea, i=0; i<tgtFE->nfe; i++, fe++)	 CountEntries(lev+1,nSize,fe,nfe,nse);}static void SaveUltraNGrams(FILE *f, BackOffLM *lm){   int n,neCnt,seCnt,fetCount;   CNEntry ne;   FLEntry *feBuf[LM_NSIZE], **feTab;   Boolean mustSwap = (vaxOrder && !natWriteOrder);   fprintf(f,"\n\\N-grams:\n");   neCnt = seCnt = 0;   CountEntries(1,lm->nSize,&lm->root,&neCnt,&seCnt);   neCnt++;   if (mustSwap) {      SwapInt32(&neCnt);      fwrite(&neCnt,sizeof(int32),1,f);      SwapInt32(&neCnt);   } else {      fwrite(&neCnt,sizeof(int32),1,f);   }   if (mustSwap) {      SwapInt32(&seCnt);      fwrite(&seCnt,sizeof(int32),1,f);      SwapInt32(&seCnt);   } else {      fwrite(&seCnt,sizeof(int32),1,f);   }   INIT_CNE(ne);                         /* write the root entry */   ne.nse = lm->root.nse;   if (mustSwap) {      SWAP_CE((&ne));      fwrite(&ne,sizeof(CNEntry),1,f);      SWAP_CE((&ne));   } else {      fwrite(&ne,sizeof(CNEntry),1,f);   }   feTab = (FLEntry **) New(&gstack,neCnt*sizeof(FLEntry *));   fetCount = 0;   feTab[fetCount++] = &lm->root;   if (lm->nSize > 1) {      feBuf[0] = &lm->root;      WriteNEntry(f,lm,1,feBuf,feTab,&fetCount);      if (trace&T_SAVE) {	 printf("saved %d CNEntry(s), (%d)\n",fetCount,neCnt); fflush(stdout);      }   }   n = WriteSEntry(f,lm,feTab,fetCount);   if (trace&T_SAVE) {      printf("saved %d SEntry(s), (%d)\n",n,seCnt); fflush(stdout);   }   Dispose(&gstack,feTab);}#endif  /* ULTRA_LM *//*------------------------- LM loading -------------------------*/#define READ_FLOAT(src,x,bin) { \   char buf[100]; \   if (!ReadFloat(src,x,1,bin)) \      HError(15490,"ReadFloat: Float expected at %s",SrcPosition(*src,buf)); \}/* EXPORT-> StoreFEA: move fea array into permanent location */void StoreFEA(FLEntry *fe, MemHeap *heap){   FLEntry *febuf;   if (fe==NULL)      return;   if (fe->nfe==0) {      fe->fea = NULL;   } else {      qsort(fe->fea, fe->nfe, sizeof(FLEntry), CmpFE);      febuf = (FLEntry *) New(heap,fe->nfe*sizeof(FLEntry));      fe->fea = memcpy(febuf, fe->fea, fe->nfe*sizeof(FLEntry));   }}/* EXPORT-> StoreSEA: move fea array into permanent location */void StoreSEA(FLEntry *fe, MemHeap *heap){   SMEntry *sebuf;   if (fe==NULL)      return;   if (fe->nse==0) {      fe->sea = NULL;   } else {      qsort(fe->sea, fe->nse, sizeof(SMEntry), CmpSE);      sebuf = (SMEntry *) New(heap,fe->nse*sizeof(SMEntry));      fe->sea = memcpy(sebuf, fe->sea, fe->nse*sizeof(SMEntry));   }}/* LoadUnigram: read the unigram part of a file */static int LoadUnigram(Source *src, BackOffLM *lm, int *itran){   char word[256];   int i,tndx,nItem;   float bowt,prob,scale;   LM_Id ndx;   SMEntry *se;   FLEntry *fe;   NameId wdid;   LMProbType ptype;   Boolean has_bowt;   nItem = 0;   scale = lm->gScale*LN10;   ptype = lm->probType;   lm->root.sea = se = lm->se_buff; lm->root.nse = 0;   lm->root.fea = fe = lm->fe_buff; lm->root.nfe = 0;   lm->root.bowt = 0.0;   lm->root.parent = 0;   SyncStr(src,"\\1-grams:");   for (i=1; i<=lm->gInfo[1].nEntry; i++){      READ_FLOAT(src,&prob,FALSE);      if (!GetSrcString(src,word,htkEsc))	 HError(15413,"LoadUnigram: Unable to read unigram %d",i);      SkipWhiteSpace(src);      if (!src->wasNewline) {            /* process backoff weight */	 READ_FLOAT(src,&bowt,FALSE);	 has_bowt = TRUE;      } else {	 has_bowt = FALSE;      }      if (itran!=NULL) {	 if ((wdid = GetNameId(lm->htab,word,FALSE))==NULL) {	    itran[i] = -1;	    continue;	 }	 if ((tndx = LM_INDEX(wdid)) > 0)	    HError(15450,"LoadUnigram: Duplicate unigram %s",word);	 ndx = itran[i] = -tndx;  /* indices pre-assigned as negative */      } else {	 wdid = GetNameId(lm->htab,word,TRUE);	 ndx = i;      }      nItem++;      lm->binMap[ndx] = wdid; /* This is where the wordlist is built */      se->ndx = LM_INDEX(wdid) = ndx;      switch(ptype) {         case LMP_FLOAT :	   se->prob = LOG10_TO_FLT(prob); break;         case LMP_LOG :#ifdef LM_COMPACT	   se->prob = Prob2Shrt(prob); break;#else	   se->prob = prob * scale; break;#endif         default:	   if (prob < 0.0)	      HError(15450,"LoadUnigram: Negative probability (%.4f) for unigram %d",		     prob,i);	   se->prob = prob; break;      }      se++; lm->root.nse++;      if (has_bowt) {         /* process backoff weight */	 fe->ndx = ndx;	 fe->nse = 0; fe->sea = NULL;	 fe->nfe = 0; fe->fea = NULL;	 switch(ptype) {	    case LMP_FLOAT :	      fe->bowt = LOG10_TO_FLT(bowt); break;	    case LMP_LOG :	      fe->bowt = bowt*scale; break;	    default :	      fe->bowt = bowt; break;	 }	 fe++; lm->root.nfe++;      }   }   if (itran!=NULL && nItem!=lm->vocSize) {      /* create dummy entries for unseen unigrams */      for (i=1; i<=lm->vocSize; i++) {	 if ((tndx = LM_INDEX(lm->binMap[i])) > 0)	    continue;	 LM_INDEX(lm->binMap[i]) = -tndx;	 se->ndx = -tndx; se->prob = 0.0;	 se++; lm->root.nse++;      }   }   StoreFEA(&(lm->root),lm->heap);   StoreSEA(&(lm->root),lm->heap);   /* check unigram consistency */   for (se=lm->root.sea, i=0; i<lm->root.nse; i++, se++) {      if (se->ndx!=i+1)	 HError(15450, "LoadUnigram: Mismatched unigram index %d should be %d", se->ndx, i+1);   }   return nItem;}#define TRINDEX(itran,i) (itran==NULL) ? i : itran[i]/* LoadNGram: read n-gram (N>1) from file f */static int LoadNGram(Source *src, int nSize, BackOffLM *lm, int *itran){   LM_Id ndx=0;   NGramInfo *gi;   LMProbType ptype;   Byte fsize, flags;   SMEntry *se=NULL;   FLEntry *feptr=NULL, *fe=NULL;   float prob,bowt,scale;   int i,j,k,num_fe,num_se; /*,n*/   char *s,lnBuf[256],word[256];   Boolean has_bowt, hasOOV, newCTX, isBin=FALSE;   NameId wdid[LM_NSIZE], keyid[LM_NSIZE];   if (nSize==1) {      return LoadUnigram(src,lm,itran);   }   scale = lm->gScale*LN10;   ptype = lm->probType;   gi = lm->gInfo+nSize;   if (gi->fmt==LMF_BINARY || gi->fmt==LMF_TEXT)      isBin = (gi->fmt==LMF_BINARY);   else      HError(15450,"LoadNGram: Unknown LM file format (%d)\n",gi->fmt);   if (trace&T_LOAD) {      printf("Loading %d %d-grams (%s)\n",	     lm->gInfo[nSize].nEntry,nSize,isBin ? "bin":"text");      fflush(stdout);   }   num_fe = num_se = 0;   keyid[0] = NULL; /* Previous context */   sprintf(lnBuf, "\\%d-grams:",nSize); SyncStr(src,lnBuf);   for (i=0; i<lm->gInfo[nSize].nEntry; i++) {      has_bowt = FALSE; hasOOV = FALSE;      if (isBin) {  /* binary model */	 fsize = (Byte) GetCh(src);	 flags = (Byte) GetCh(src);	 READ_FLOAT(src,&prob,TRUE);	 for (j=0; j<nSize; j++) {	    if (flags&INT_LMID) {	       UInt a;	       ReadInt(src,(int *)&a,1,TRUE);	       ndx = (LM_Id) a;	    } else {	       UShort a;	       ReadShort(src,(short *)&a,1,TRUE);	       ndx = (LM_Id) a;	    }	    if (itran!=NULL && itran[ndx]<0) {	       hasOOV = TRUE;	    } else {	       if (itran!=NULL) ndx = itran[ndx];	       if ((ndx > 0) && (ndx <=lm->vocSize))		  wdid[j] = lm->binMap[ndx];	       else		  HError(15450,"LoadNGram: LM index out of bounds (%d)", ndx);	    }	 }	 if (flags&HAS_BOWT) {	    READ_FLOAT(src,&bowt,TRUE);	    has_bowt = TRUE;	 }      } else { 	/* text model */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -