
📄 lmodel.c

📁 Hidden Markov Toolkit (HTK) 3.2.1. HTK is a toolkit for use in research into automatic speech recognition.
💻 C
📖 Page 1 of 5
                  x = (UShort) feStack[j]->ndx;
                  WriteShort(f,(short *)&x,1,TRUE);
               }
               x = (UShort) se->ndx;
               WriteShort(f,(short *)&x,1,TRUE);
            }
            if (flags&HAS_BOWT)
               WriteFloat(f,&bowt,1,TRUE);      /* back-off weight */
         } else {
            fprintf(f, "%+.4f",prob);
            fprintf(f, "\t%s",context);
            word = lm->binMap[se->ndx]->name;
            if (htkEsc)
               word = ReWriteString(word,NULL,ESCAPE_CHAR);
            fprintf(f, "%s",word);
            if (has_bowt)
               fprintf(f, "\t%+.4f",bowt);
            fprintf(f, "\n");
         }
         nItem++;
      }
   }
   return nItem;
}

/* SaveNGram: write LM into file f */
static int SaveNGram(FILE *f, int G, BackOffLM *lm)
{
   int total;
   Byte fsize;
   FLEntry *feStack[LM_NSIZE];
   Boolean useIntID;

   if (lm->vocSize > USHRT_MAX) {
      if (sizeof(LM_Id) <= sizeof(UShort))
         HError(15445,"SaveNGram: vocSize = %d but using %d-byte IDs",
                lm->vocSize, sizeof(LM_Id));
      useIntID = TRUE;
   } else {
      useIntID = defIntID;
   }
   fprintf(f, "\n\\%d-grams:\n", G);
   feStack[0] = &(lm->root);
   total = WriteNGram(f,lm,feStack,1,G,useIntID);
   if (lm->gInfo[G].fmt==LMF_BINARY) {
      /* write out 2 zero bytes */
      fsize = 0;
      fwrite(&fsize, sizeof(unsigned char), 1, f);
      fwrite(&fsize, sizeof(unsigned char), 1, f);
   }
   if (trace&T_SAVE)
      printf("Wrote %d %d-grams\n", total, G);
   return total;
}

/* SaveLangModel: save language model lm to lmFn */
void SaveLangModel(char *lmFn, BackOffLM *lm)
{
   char c=' ';
   int i,n;
   FILE *f;
   NGramInfo *gi;
   Boolean isPipe,isUltra;

#ifdef HTK_CRYPT
   if (lm->encrypt) {
      TMP_OPEN(f,lmFn,HError(15411,"SaveLangModel: Cannot create lm file %s",lmFn));
   }
   else
#endif
   if ((f = FOpen(lmFn, LangModOFilter, &isPipe))==NULL)
      HError(15411,"SaveLangModel: Unable to open output file %s",lmFn);
   WriteHeaderInfo(f,lm);
   fprintf(f, "\\data\\\n");
   isUltra = FALSE;
   for (gi=lm->gInfo+1,i=1; i<=lm->nSize; i++,gi++) {
      switch (gi->fmt) {
         case LMF_TEXT:   c = '='; break;
         case LMF_BINARY: c = '~'; break;
         case LMF_ULTRA:  c = '#'; isUltra = TRUE; break;
         default:
            HError(15490,"SaveLangModel: Unknown LM file format (%d) for %d-gram",gi->fmt,i);
      }
      fprintf(f, "ngram %d%c%d\n",i,c,gi->nEntry);
   }
   if (isUltra) {
#ifdef ULTRA_LM
      ultraKey[KEY_LENGTH-1] = (vaxOrder && natWriteOrder) ? 1 : 0;
      fprintf(f,"KEY: ");
      for (i=0; i<KEY_LENGTH; i++) fprintf(f,"%02x ",ultraKey[i]);
      fprintf(f,"\n");
      SaveNGram(f,1,lm);
      SaveUltraNGrams(f,lm);
#else
      HError(15490,"SaveLangModel: Ultra format LMs not supported");
#endif
   } else {
      for (i=1; i<=lm->nSize; i++) {
         if ((n=SaveNGram(f,i,lm))!=lm->gInfo[i].nEntry) {
            HError(-15490,"SaveLangModel: %d-gram nEntry = %d, actual saved %d",
                   i,lm->gInfo[i].nEntry,n);
            lm->gInfo[i].nEntry = n;
         }
      }
   }
   fprintf(f, "\n\\end\\\n");
#ifdef HTK_CRYPT
   if (lm->encrypt) {
      FILE *crf;
      TMP_REWIND(f);
      if ((crf = FOpen(lmFn,LangModOFilter,&isPipe)) == NULL) {
         TMP_CLOSE(f,lmFn);
         HError(15411,"SaveLangModel: Cannot create LM file %s",lmFn);
      }
      EncryptFile(lmFn,crf,f);
      FClose(crf,isPipe);
      TMP_CLOSE(f,lmFn);
   }
   else
#endif
   FClose(f,isPipe);
}

/*---------------------- N-gram access ---------------------- */

/* EXPORT-> GetNGramProb: generic LM access V2 */
float GetNGramProb(BackOffLM *lm, NameId *words, int nSize)
{
   int i;
   float prob;
   SMEntry *se;
   FLEntry *fe;
   AccessInfo *acs;
   LMProbType ptype;
   char *s, sbuf[256];
   static int rLev = -1;
   float prob_mult = 0.0;

   /* NGram probability lookup works like this:
      1) We see if we're looking for a unigram and if so search for an
         appropriate leaf SMEntry at the root level. If we don't find
         one then we must abort with an error at this point.
      2) For other lengths we search for the path down the tree to the
         FLEntry for the given history. If we don't find a full history
         path we reduce the context and call ourselves recursively.
      3) If we found the context then we look at the SMEntry elements
         at the FLEntry node to see if we can find our word with the
         given history. If we can then we return the stored probability,
         otherwise we recursively call ourselves again with a reduced
         history, multiplying by the back-off weight associated with
         the given history (at the FLEntry node) when we return.
   */

   /* If we're using a class-based language model then we still get passed
      a word history which must be converted into a class history */
   if (lm->classLM) {
      /* Retrieve word|class probability for word we want to predict */
      prob_mult = ((WordProb*)(words[nSize-1]->ptr))->prob;
      if (trace&T_PROB) {
         if (lm->probType & LMP_FLOAT) { /* this never happens in practice */
            printf("<w|c mult=%5.2f> ", UNLOG_NATURAL(prob_mult));
         }
         else {
            printf("<w|c mult=%5.2f> ", prob_mult);
         }
      }
      /* Convert word N-gram into class N-gram */
      for (i=0; i<nSize; i++) {
         words[i] = ((WordProb*)(words[i]->ptr))->class;
      }
   }
   rLev++;
   ptype = lm->probType;
   if (nSize > lm->nSize) {
      words += nSize-lm->nSize; nSize = lm->nSize;
   }
   acs = lm->gInfo[nSize].aInfo; acs->count++;
   if (trace&T_PROB) {
      printf("[ ");
      printf("(%s",words[nSize-1]->name);
      if (nSize > 1) {
         printf(" |");
         for (i=0; i<nSize-1; i++) printf(" %s",words[i]->name);
      }
      printf(") ");
   }
   if (nSize==1) {  /* lookup unigram separately */
      if ((se = FindSE(lm->root.sea,0,lm->root.nse,LM_INDEX(words[0])))==NULL)
         HError(15490,"GetNGramProb: Unable to find %s in unigrams",words[0]->name);
#ifdef LM_COMPACT
      prob = Shrt2Prob(se->prob) * lm->gScale;
#else
      prob = se->prob;
#endif
      if (trace&T_PROB)
         printf("exact, ");
   } else {         /* generic n-gram lookup, n>1 */
      for (fe=&(lm->root), i=0; i<nSize-1; i++) {
         if ((fe=FindFE(fe->fea, 0, fe->nfe, LM_INDEX(words[i])))==NULL)
            break;
      }
      if ((fe == NULL) || (fe->nse == 0)) {
         if (lm->classLM) {
            lm->classLM = FALSE;
            prob = GetNGramProb(lm,words+1,nSize-1);
            lm->classLM = TRUE;
         }
         else prob = GetNGramProb(lm,words+1,nSize-1);
         if (trace&T_PROB)
            printf("replaced, ");
         acs->nmiss++;
         if ((trace&T_TOP) && (fe != NULL) && (fe->nse == 0)) {
            for (s = sbuf, i=0; i<nSize-1; i++) {
               sprintf(s,"%s ",words[i]->name); s+=strlen(s);
            }
            HError(-15492, "GetNGramProb: FLEntry.nse==0; original ARPA LM?\n%s",sbuf);
         }
      } else {
         if ((se = FindSE(fe->sea, 0, fe->nse, LM_INDEX(words[nSize-1])))!=NULL) {
#ifdef LM_COMPACT
            prob = Shrt2Prob(se->prob) * lm->gScale;
#else
            prob = se->prob;
#endif
            if (trace&T_PROB)
               printf("exact, ");
            acs->nhits++;
         } else {
            if (lm->classLM) {
               lm->classLM = FALSE;
               prob = GetNGramProb(lm,words+1,nSize-1);
               lm->classLM = TRUE;
            }
            else prob = GetNGramProb(lm,words+1,nSize-1);
            if (ptype==LMP_FLOAT)
               prob *= fe->bowt;
            else
               prob += fe->bowt;
            if (trace&T_PROB)
               printf("backed-off %.4f, ",fe->bowt);
            acs->nboff++;
         }
      }
   }
   if (lm->classLM) {
      if (lm->probType & LMP_FLOAT) {
         /* This looks nasty but in fact we never execute this */
         prob *= UNLOG_NATURAL(prob_mult);
      }
      else {
         prob += prob_mult;
      }
   }
   acs->prob += prob; acs->prob2 += prob*prob;
   if (trace&T_PROB)
      printf("prob %.4f ]%s",prob,(rLev==0) ? "\n" : " ");
   rLev--;
   return prob;
}

/* EXPORT-> LMTrans: calls GetNGramProb, but instead of taking a full
   n-gram of context we take a pointer to a context and a single word;
   we also return a language model context state */
LogFloat LMTrans2(LModel *LM, LMState src, LabId word, LMState *dest)
{
   NameId ngram[LM_NSIZE], ngramRev[LM_NSIZE];
   int nSize;
   float prob;
   NameId nid;
   LogFloat prob_mult = 0.0;
   FLEntry *context, *fe;
   SMEntry *se;
   BackOffLM *lm;
   float bo_weight;
   LMProbType ptype;
   int i, index;
   int nShorten;  /* Amount to shorten n-gram by when searching for prob */

   lm = LM->data.hlmModel;
   ptype = lm->probType;
   if (src) {
      context = (FLEntry *) src;
   }
   else {
      context = &(lm->root); /* No context yet */
   }
   /* Convert word text to NameId */
   if (lm->classLM) { /* class model */
      nid = GetNameId(lm->classH, word->name, FALSE);
      if (!nid)
         HError(15499, "LMTrans: Attempt to predict token '%s' which is not in vocabulary", word->name);
      /* Find word-given-class probability and convert to a class */
      prob_mult = ((WordProb*)(nid->ptr))->prob;
      if (trace&T_PROB) {
         if (ptype & LMP_FLOAT) { /* this first branch never happens in practice */
            printf("<w|c mult=%5.2f> ", UNLOG_NATURAL(prob_mult));
         }
         else {
            printf("<w|c mult=%5.2f> ", prob_mult);
         }
      }
   }
   else { /* not a class model */
      nid = GetNameId(lm->htab, word->name, FALSE);
      if (!nid)
         HError(15499, "LMTrans: Attempt to predict token '%s' which is not in vocabulary", word->name);
   }
   /* We need to reconstruct the context later so do it now in case we need to back off */
   fe = context;
   nSize = 0;
   while (fe && fe!=&(lm->root) && nSize<LM_NSIZE) {
      ngramRev[nSize] = lm->binMap[fe->ndx];
      fe = fe->parent;
      nSize++;
   }
   if (nSize>=LM_NSIZE)
      HError(15499, "LMTrans: Context rebuilt to longer than compiled ngram size limit of %d", LM_NSIZE);
   /* And now we know the length we can reverse it */
   for (i=0; i<nSize; i++) ngram[i] = ngramRev[nSize-(i+1)];
   ngram[nSize] = nid;
   nSize++;
   /* For debugging purposes, print out the full ngram */
   /*printf("nsize=%d  ", nSize);
     for (i=0; i<nSize; i++) printf("%s ", ngram[i]->name); printf("\n");*/
   /* Search for probability */
   if (ptype & LMP_FLOAT)
      bo_weight = 1;
   else
      bo_weight = 0;
   se = FindSE(context->sea, 0, context->nse, LM_INDEX(nid));
   nShorten = 0;
   fe = context;
   while (!se) {
      /* Multiply BO weight and shorten context */
      if (ptype & LMP_FLOAT)
         bo_weight *= fe->bowt;
      else
         bo_weight += fe->bowt;
      nShorten++;
      if (nShorten==nSize) { /* Unigram probability */
         se = FindSE(lm->root.sea, 0, lm->root.nse, LM_INDEX(nid));
         if (!se)
            HError(15490, "LMTrans: Unable to find %s in unigrams", nid->name);
      }
      else { /* n>1 */
         fe = &(lm->root);
         for (i=nShorten; i<nSize-1; i++) {
            fe = FindFE(fe->fea, 0, fe->nfe, LM_INDEX(ngram[i]));
            if (!fe) HError(15491, "LMTrans: Unable to find shortened context in LM");
         }
         se = FindSE(fe->sea, 0, fe->nse, LM_INDEX(ngram[i]));
      }
   }
#ifdef LM_COMPACT
   prob = Shrt2Prob(se->prob) * lm->gScale;
#else
   prob = se->prob;
#endif
   if (ptype & LMP_FLOAT) {
      prob = prob * bo_weight;
   }
   else {
      prob = prob + bo_weight;
   }
   /* Now look for FLEntry for new context for any further following word */
   /* Decide from which point in the context we start searching */
   if (nSize == lm->nSize)
      index = 1;
   else
      index = 0;
   do {
      fe = &(lm->root);
      for (i=index; i<nSize; i++) {
         fe = FindFE(fe->fea, 0, fe->nfe, LM_INDEX(ngram[i]));
         if (!fe) {
            /* Context not found, so shorten and retry */
            index++;
            break;
         }
      }
   } while (!fe); /* Works because if no context then we don't execute inner loop and fe=&(lm->root) */
   *dest = fe;
   if (lm->classLM) {
      if (lm->probType & LMP_FLOAT) {
         /* This looks nasty but in fact it never executes in practice */
         prob *= UNLOG_NATURAL(prob_mult);
      }
      else {
         prob += prob_mult;
      }
   }
   return prob;
}

/* EXPORT-> GetNGramAddress: same as GetNGramProb but returns the address
   of the structure. This is used to provide a unique id for a particular
   context, and is used with the Lattice Toolkit.
   The final word in words[] (i.e. words[nSize-1]) is a dummy entry which
   is never used. Its value is undefined and should not be interpreted.
   It works like this in order to parallel GetNGramProb() */
void *GetNGramAddress(BackOffLM *lm, NameId *words, int nSize)
{
   int i;
   FLEntry *fe;
   char *s, sbuf[256];
   static int r
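The recursive back-off scheme described in the comment at the top of GetNGramProb can be distilled into a few lines. Below is a minimal, self-contained sketch of that recursion for the bigram case; the toy tables, the helper name BigramProb and all numbers are hypothetical illustrations, not part of the HTK API. Probabilities are log10, as stored in ARPA-format LM files.

/* Toy back-off bigram lookup: exact hit, else bowt(hist) + unigram.
   All data and names below are illustrative only. */
#include <stdio.h>
#include <string.h>

typedef struct { const char *hist, *word; float prob; } Bigram;
typedef struct { const char *word; float prob, bowt; } Unigram;

static Bigram  bi[]  = { {"the","cat",-0.5f}, {"the","dog",-0.7f} };
static Unigram uni[] = { {"the",-1.0f,-0.3f}, {"cat",-2.0f,0.0f},
                         {"dog",-2.1f,0.0f},  {"fish",-2.5f,0.0f} };

#define NBI  ((int)(sizeof bi  / sizeof bi[0]))
#define NUNI ((int)(sizeof uni / sizeof uni[0]))

/* Return log10 P(word|hist): the stored bigram if present, otherwise
   back off, i.e. add bowt(hist) to the unigram probability of word.
   This mirrors the prob += fe->bowt branch in GetNGramProb. */
static float BigramProb(const char *hist, const char *word)
{
   int i;
   float bowt = 0.0f;                     /* log10(1) if hist has no bowt */

   for (i = 0; i < NBI; i++)              /* 1) exact n-gram hit?         */
      if (!strcmp(bi[i].hist,hist) && !strcmp(bi[i].word,word))
         return bi[i].prob;
   for (i = 0; i < NUNI; i++)             /* 2) back-off weight of the
                                                shortened history         */
      if (!strcmp(uni[i].word,hist)) { bowt = uni[i].bowt; break; }
   for (i = 0; i < NUNI; i++)             /* 3) fall back to the unigram  */
      if (!strcmp(uni[i].word,word))
         return bowt + uni[i].prob;
   return -99.0f;                         /* OOV floor                    */
}

int main(void)
{
   printf("P(cat|the)  = %.4f (exact)\n",      BigramProb("the","cat"));
   printf("P(fish|the) = %.4f (backed off)\n", BigramProb("the","fish"));
   return 0;
}

Here the second lookup misses the bigram table, so the result is bowt("the") + log10 P(fish) = -0.3 + (-2.5) = -2.8. GetNGramProb does the same thing for arbitrary n, recursing on a history shortened by one word at each miss and accumulating one back-off weight per level.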

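For the class-based path, both GetNGramProb and LMTrans2 combine two factors: P(w|h) = P(w | c(w)) x P(c(w) | c(h)), the word-given-class probability times the class n-gram probability. Because the model works with log probabilities, the product becomes the addition prob += prob_mult seen at the end of both functions. As an illustration (numbers invented): with log10 P(w|c) = -0.30 and log10 P(c|h) = -1.20, the returned score is -1.50, a probability of about 0.032. The LMP_FLOAT branch that multiplies by UNLOG_NATURAL(prob_mult) exists only for linear-domain models and, as the source comments note, never executes in practice.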