📄 lmodel.c

📁 该压缩包为最新版htk的源代码,htk是现在比较流行的语音处理软件,请有兴趣的朋友下载使用
💻 C
📖 第 1 页 / 共 5 页
字号:
	 READ_FLOAT(src,&prob,FALSE);	 for (j=0; j<nSize; j++) {     /* read n-gram words */	    if (!GetSrcString(src,word,htkEsc))	       HError(15450,"LoadNGram: Unable to read word %d of %d-gram",j,nSize);	    if ((wdid[j] = GetNameId(lm->htab,word,FALSE))==NULL) {	       if (itran==NULL)		  HError(-15450, "LoadNGram: Word %s not in unigrams, skipping n-gram", word);	       hasOOV = TRUE;	    } else {	       ndx = LM_INDEX(wdid[j]);	    }	 } 	 SkipWhiteSpace(src);	 if (!src->wasNewline) {	    READ_FLOAT(src,&bowt,FALSE);	    has_bowt = TRUE;	 }      }      if (hasOOV) continue;      /* See if the context has changed */      for (newCTX=FALSE, j=0; j<nSize-1; j++) {	 if (keyid[j]!=wdid[j]) {	    newCTX=TRUE; break;	 }      }      /* Guaranteed to execute the first time through the loop because the context	 has not been seen before (thus defining fe, se etc) */      if (newCTX) {             /* new n-gram context */	 if (keyid[0]!=NULL) {  /* copy to permanent storage */	    StoreFEA(feptr,lm->heap); num_fe += feptr->nfe;	    StoreSEA(feptr,lm->heap); num_se += feptr->nse;	 }	 for (feptr = &(lm->root), j=0; j<nSize-1; j++) {	    if ((feptr = FindFE(feptr->fea, 0, feptr->nfe, LM_INDEX(wdid[j])))==NULL) {	       for (s=lnBuf,k=0; k<nSize; k++) {		  sprintf(s,"%s[%d] ",wdid[k]->name,LM_INDEX(wdid[k])); s+=strlen(s);	       }	       HError(15420, "LoadNGram: Cannot find component %d of (%d) %d-gram %s",		      j,i,nSize,lnBuf);	    }	    keyid[j] = wdid[j];	 }	 feptr->fea = fe = lm->fe_buff; feptr->nfe = 0;	 feptr->sea = se = lm->se_buff; feptr->nse = 0;      }      se->ndx = ndx;      switch(ptype) {         case LMP_FLOAT :	   se->prob = LOG10_TO_FLT(prob); break;         case LMP_LOG :#ifdef LM_COMPACT	   se->prob = Prob2Shrt(prob); break;#else	   se->prob = prob * scale; break;#endif         default:	   se->prob = prob; break;      }      se++; (feptr->nse)++;      if (has_bowt) {	 /* also store as full entry */	 fe->ndx = ndx;	 fe->nse = 0; fe->sea = NULL;	 fe->nfe = 0; fe->fea = NULL;	 switch(ptype) {	    case LMP_FLOAT :	      fe->bowt = LOG10_TO_FLT(bowt); break;	    case LMP_LOG :	      fe->bowt = bowt*scale; break;	    default :	      fe->bowt = bowt; break;	 }	 fe++; (feptr->nfe)++;      }   }   if (keyid[0]!=NULL) {  /* store the last accumulated */      StoreFEA(feptr,lm->heap); num_fe += feptr->nfe;      StoreSEA(feptr,lm->heap); num_se += feptr->nse;   }   /*      if (isBin) {        // read the last 2 zero bytes   //         ReadShort(src,&ndx,1,TRUE);      }   */   if (trace&T_LOAD) {      printf("  SMEntry: %8d x %2d bytes = %d bytes\n",	     num_se, sizeof(SMEntry), num_se*sizeof(SMEntry));      printf("  FLEntry: %8d x %2d bytes = %d bytes\n",	     num_fe, sizeof(FLEntry), num_fe*sizeof(FLEntry));   }   lm->gInfo[0].nEntry+=num_fe;   return num_se;}/* Create reverse lookup pointers in FLEntry context tree *//* Call with (lm->root, 0) and let it recurse its way down */void CreateReverseLookup(FLEntry *fes){   int i; /* loop counter */   for (i=0; i<fes->nfe; i++) {      fes->fea[i].parent = fes;      CreateReverseLookup(&(fes->fea[i]));   }}/* EXPORT-> ReadHeaderInfo: read header information *//* First parameter is source file, second is LM structure, and   third parameter is first input line or NULL to read from file */void ReadHeaderInfo(Source *src, BackOffLM *lm, char *line1){   float ff;   int i,j,n;   char *s,*s1,*s2=NULL;   DiscountType dt;   BackOffInfo *bo;   DiscountInfo *di;   char lnBuf[MAXSYMLEN],*sbuf;   lm->probType = LMP_FLOAT|LMP_LOG;   for (i=1; i<LM_NSIZE; i++)      lm->gInfo[i].boInfo = NULL;   while(line1 || GetInLine(src, lnBuf)) {      if (line1) {         strcpy(lnBuf, line1);         line1 = NULL; /* Read the rest from the file */      }      if ((s=strstr(lnBuf,"\\data\\"))!=NULL && s==lnBuf) {	 break;                   /* gone past header, so exit */      }      if (strcmp(lnBuf,"COUNTS")==0) {	lm->probType = LMP_COUNT; continue;      }      for (i=1; i<LM_NSIZE; i++) {   /* try each n-gram name in turn */	 if ((s=strstr(lnBuf,nGramName[i]))==NULL || s!=lnBuf)	    continue;	 bo = (BackOffInfo *) New(lm->heap,sizeof(BackOffInfo));	 di = &(bo->dcInfo);	 if (trace&T_LOAD)	    printf("Parsing %s header info\n",nGramName[i]);	 for (dt=DC_LAST,j=0; j<DC_LAST; j++) {	    if (strstr(lnBuf,dcTypeName[j])!=NULL) {	       dt = j; break;	    }	 }	 if (dt==DC_LAST)	    HError(15450,"LoadHeaderInfo: Unable to parse d-type in %s",lnBuf);	 bo->dcType = dt;	 if ((s1=strstr(lnBuf,"cutoff"))==NULL)	    HError(15450,"LoadHeaderInfo: Unable to find 'cutoff' in %s",lnBuf);	 if (sscanf(s1,"cutoff %d",&n)!=1)	    HError(15450,"LoadHeaderInfo: Unable to parse cutoff value in %s",lnBuf);	 bo->cutOff = n; bo->wdThresh = 0.0;	 if (!GetInLine(src,lnBuf))	    HError(15450,"LoadHeaderInfo: EOF reading d-coefs for %s",nGramName[i]);	 switch (dt) {	 case DC_KATZ:	   if ((s1 = strchr(lnBuf,'['))==NULL || (s2 = strchr(lnBuf,']'))==NULL)	     HError(15450,"LoadHeaderInfo: Unable to find array bounds in %s",lnBuf);	   *s2='\0'; sbuf = s2+1;	   di->tgInfo.kRange = n = atoi(s1+1);	   di->tgInfo.coef = (float *) New(lm->heap,(n+1)*sizeof(float));	   for (j=1; j<=n; j++) {	     s1 = strtok((j==1)?sbuf:NULL," \t\r\n:");	     if (s1==NULL)	       HError(15450,"LoadHeaderInfo: Unable to parse coef %d in %s",j,lnBuf);	     di->tgInfo.coef[j]=atof(s1);	   }	   break;	 case DC_ABSOLUTE:	   if ((s1=strstr(lnBuf,"coef:"))==NULL)	     HError(15450,"LoadHeaderInfo: Unable to find 'coef:' in %s",lnBuf);	   if (sscanf(s1,"coef: %f",&ff)!=1)	     HError(15450,"LoadHeaderInfo: Unable to parse float value in %s",s1);	   di->bCoef=ff;	   break;	 default :	   HError(15450,"LoadHeaderInfo: Unsupported LM type (%d)",dt);	   break;	 }	 lm->gInfo[i].boInfo = bo;      }   }}/* EXPORT-> WriteHeaderInfo: write header information */void WriteHeaderInfo(FILE *f, BackOffLM *lm){   int i,j;   BackOffInfo *bo;   DiscountInfo *di;   if (lm->probType==LMP_COUNT)      fprintf(f,"COUNTS\n\n");   for (i=2; i<=lm->nSize; i++) {      if ((bo = lm->gInfo[i].boInfo)==NULL)	 continue;      di = &(bo->dcInfo);      if (bo->wdThresh>0)	 fprintf(f, "%s: method %s, cutoff %d, wdThresh %.3f\n",		 nGramName[i], dcTypeName[bo->dcType], bo->cutOff, bo->wdThresh);      else	 fprintf(f, "%s: method %s, cutoff %d\n",		 nGramName[i], dcTypeName[bo->dcType], bo->cutOff);      switch (bo->dcType) {         case DC_KATZ :	    fprintf(f, "  coef[%d]:", di->tgInfo.kRange);	    for (j=1; j<=di->tgInfo.kRange; j++)	       fprintf(f," %.6f", di->tgInfo.coef[j]);	    fprintf(f,"\n");	    break;	 case DC_ABSOLUTE :	    fprintf(f, "  coef: %.6f\n", di->bCoef);	    break;	 case DC_LINEAR :	 default:            break;      }      fprintf(f, "\n");   }}/* ReadClassProbsHeader: read in word|class probabilities header */static void ReadClassProbsHeader(char *fname, int *nWords, Source *src, BackOffLM *lm){   char   line[MAXSYMLEN];   /* Current input line */   char  *ptr;               /* Temporary pointers */   *nWords = -1;   if (!src->f) {      /* Open file if necessary */      if (InitSource(fname, src, LangModFilter)!=SUCCESS) {         HError(15410, "ReadClassProbsHeader: Unable to open language model word|class file '%s'", fname);      }   }   strcpy(line, "");   GetInLine(src, line);   if (strncmp(line, "Word|Class probabilities", 25)==0) {      lm->classCounts = FALSE;   }   else if (strncmp(line, "Word|Class counts", 17)==0) {      lm->classCounts = TRUE;   }   else {      HError(15450, "ReadClassProbsHeader: Language model word|class file is in unknown format");   }   if (trace & T_LOAD) {      printf("Word|class file uses word %s\n", lm->classCounts?"counts":"probabilities");   }   while (GetInLine(src, line)) {      if (strncmp(line, "Number of classes", 17)==0) {         ptr = strchr(line, ':');         if (!ptr) {            HError(15450, "ReadClassProbsHeader: Corrupt 'Number of classes' line in word|class file");         }         ptr++;         while (*ptr==' ' || *ptr=='\t') ptr++;         if (trace & T_LOAD) {            printf("Number of classes = %d\n", atoi(ptr));         }      }      else if (strncmp(line, "Number of words", 15)==0) {         ptr = strchr(line, ':');         if (!ptr) {            HError(15450, "ReadClassProbsHeader: Corrupt 'Number of words' line in word|class file");         }         ptr++;         while (*ptr==' ' || *ptr=='\t') ptr++;         *nWords = atoi(ptr);         if (trace & T_LOAD) {            printf("Number of words = %d\n", *nWords);         }      }      else if ((strncmp(line, "Word", 4)==0) || (strncmp(line, "Class", 5)==0)) {         break;      }   }   if (feof(src->f)) {      HError(15450, "ReadClassProbsHeader: Word|Class language model file contains no %s", lm->classCounts?"counts":"probabilities");   }   if (*nWords == -1) {      HError(15450, "ReadClassProbsHeader: Failed to find number of words header in word|class file");   }}/* ReadClassCounts: read in word|class counts file */static void ReadClassCounts(Source *src, int nWords, BackOffLM *lm){   char   line[MAXSYMLEN];   /* Current input line */   char  *ptr, *ptr2;        /* Temporary pointers */   int    i;                 /* Loop counter */   WordProb *wordProb;       /* Temporary pointer */   int    loop=1;            /* Array index counter */   NameId nid, nid2;         /* Word ids */   int    class_id=0;        /* Number classes from 0 */   int    floor_count = 0;   /* Number of counts floored */   /* Add labels and wordlist entries for words */   for (i=0; i<nWords; i++) {      ptr = GetInLine(src, line);      if (!ptr || strlen(ptr)==0) {         HError(15450, "ReadClassCounts: Blank line/end of file in word|class language model file");      }      /* Segment line into word, class and count */      /* Don't use strtok() in case a client program is using it */      ptr2 = ptr + strcspn(ptr, " \t"); /* Find end of word */      *ptr2 = '\0';      /* Get name ID */      nid = GetNameId(lm->classH, ptr, TRUE);      /* Find class name */      ptr = ptr2 + 1; /* Pass over NULL */      ptr += strspn(ptr, " \t"); /* Skip whitespace */      ptr2 = ptr + strcspn(ptr, " \t"); /* Find end of class name */      *ptr2 = '\0';      nid2 = GetNameId(lm->htab, ptr, TRUE); /* Get name id of class */      class_id = atoi(ptr+5) - 1; /* assume called CLASSn */ /* GLM */      nid2->ptr = (void*) class_id;      ptr = ptr2 + 1; /* Pass over NULL */      ptr += strspn(ptr, " \t"); /* Skip over whitespace */      lm->word[i] = atoi(ptr); /* Store word count */      if (lm->word[i]<=0) {         floor_count++;         lm->word[i] = 1; /* Force zero counts to 1 in order to avoid 0 probabilities */         if (floor_count==5)            HError(-15450, "ReadClassCounts: too many floored counts to list");         else if (floor_count<5)            HError(-15450, "ReadClassCounts: flooring zero count to one for '%s'", nid->name);      }      /* Create structure storing word|class probability and class of word */      wordProb = New(&gcheap, sizeof(WordProb));      wordProb->class = nid2;      wordProb->prob = 0; /* we haven't calculated this yet */      wordProb->id = i;      nid->ptr = wordProb; /* Point word name id here */      /* Set up binMap equivalent */      lm->classBM[loop] = nid;      LM_INDEX(nid) = -i; /* assign negative indices (copied code) */      loop++;   }   /* Check for left over lines */   while (GetInLine(src, line)) {      if (strlen(line)>0) {         HError(15450, "ReadClassCounts: Extraneous line on end of Word|Class probabilities file\n('%s')", line);      }   }   if (floor_count)      HError(-15450, "ReadClassCounts: a total of %d counts were floored", floor_count);}/* CountClassTotals: calculate class count totals for LM */static void CountClassTotals(BackOffLM *lm){   register int i; /* Loop counter */   int word_id, class_id;   lm->totals = New(&gcheap, lm->vocSize * sizeof(int));   for (i=0; i<lm->vocSize; i++) {      lm->totals[i] = 0;   }   for (i=0; i<(lm->classW); i++) {      word_id = ((WordProb*)(lm->classBM[i+1]->ptr))->id;      if (word_id!=i) HError(15490, "CountClassTotals: Inconsistent word ids found");      class_id = (int)(((WordProb*)(lm->classBM[i+1]->ptr))->class->ptr);      lm->totals[class_id] += lm->word[i];   }}/* CalcWordClassProbs: calculate initial/static word|class probabilities */static void CalcWordClassProbs(BackOffLM *lm){   int i; /* loop counter */   int class_id;   double prob=0;   /* For each word */   for (i=0; i<lm->classW; i++) {      class_id = (int)(((WordProb*)(lm->classBM[i+1]->ptr))->class->ptr);      prob = (((double)(lm->word[i]))) / ((double)(lm->totals[class_id]));      ((WordProb*)(lm->classBM[i+1]->ptr))->prob = LOG_NATURAL(prob);   }}/* ReadClassProbs: read in word|class probabilities file */static void ReadClassProbs(Source *src, int nWords, BackOffLM *lm){   char   line[MAXSYMLEN];   /* Current input line */   char  *ptr, *ptr2;        /* Temporary pointers */   int    i;                 /* Loop counter */   WordProb *wordProb;       /* Temporary pointer */   int loop=1;   NameId nid, nid2;   /* Add labels and wordlist entries for words */   for (i=0; i<nWords; i++) {      ptr = GetInLine(src, line);      if (!ptr || strlen(ptr)==0) {         HError(15450, "ReadClassProbs: Blank line/end of file in word|class language model file");      }      /* Segment line into word, class and log probability */      /* We could use strtok(), but I can't be sure that this isn't being         used elsewhere wrapped around this call, so I won't! */      ptr2 = ptr + strcspn(ptr, " \t"); /* Find end of word */
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -