
lmodel.c
Hidden Markov Model Toolbox (C)
Page 1 of 5
      *ptr2 = '\0';
      /* Get name ID */
      nid = GetNameId(lm->classH, ptr, TRUE);
      /* Find class name */
      ptr = ptr2 + 1;                   /* Pass over NULL */
      ptr += strspn(ptr, " \t");        /* Skip whitespace */
      ptr2 = ptr + strcspn(ptr, " \t"); /* Find end of class name */
      *ptr2 = '\0';
      nid2 = GetNameId(lm->htab, ptr, TRUE); /* Get name id of class */
      ptr = ptr2 + 1;                   /* Pass over NULL */
      ptr += strspn(ptr, " \t");        /* Skip over whitespace */
      /* Create structure storing word|class probability and class of word */
      wordProb = New(&gcheap, sizeof(WordProb));
      wordProb->class = nid2;
      wordProb->prob = atof(ptr);
      wordProb->id = -1;
      nid->ptr = wordProb; /* Point word name id here */
      /* Set up binMap equivalent */
      lm->classBM[loop] = nid;
      LM_INDEX(nid) = -i; /* assign negative indices (copied code) */
      loop++;
   }

   /* Check for left over lines */
   i = 0;
   while (GetInLine(src, line)) {
      if (strlen(line)>0) {
         if (i>10) {
            HError(-15451, "ReadClassProbs: Further extraneous lines not shown");
            break;
         }
         HError(-15451, "ReadClassProbs: Extraneous line on end of Word|Class probabilities file\n('%s')", line);
         i++;
      }
   }
}
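/* Illustrative sketch (not part of the toolkit): the loop above tokenises
   each "word class prob" line in place with strspn/strcspn, overwriting
   each separator with '\0' and stepping past it.  Stand-alone, the idiom
   looks like this (all names here are hypothetical): */
#if 0  /* example only, not compiled */
#include <string.h>
#include <stdlib.h>

/* Split "word class prob" in buf, modifying buf in place.
   Returns 1 on success, 0 if a field is missing. */
static int ParseWordClassLine(char *buf, char **word, char **cls, double *prob)
{
   char *p = buf + strspn(buf, " \t");  /* skip leading whitespace */

   *word = p;
   p += strcspn(p, " \t");              /* find end of word */
   if (*p == '\0') return 0;
   *p++ = '\0';                         /* terminate word, pass over NUL */
   p += strspn(p, " \t");               /* skip whitespace */
   *cls = p;
   p += strcspn(p, " \t");              /* find end of class name */
   if (*p == '\0') return 0;
   *p++ = '\0';                         /* terminate class name */
   p += strspn(p, " \t");               /* skip whitespace */
   *prob = atof(p);                     /* probability (or count) field */
   return 1;
}
#endif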
/* EXPORT-> LoadLangModel: read N-gram language model from fn */
BackOffLM *LoadLangModel(char *fn, WordMap *wl, float gramScale,
                         LMProbType tgtPType, MemHeap *heap)
{
   Source src;
   NGramInfo *gi;
   BackOffLM *lm;
   int *itran,nSize,i,n;
   char c,sfmt[256];
   char lnBuf[MAXSYMLEN];
   Boolean isUltra;
   char *first_line;         /* First line of input file */
   char wc_fname[MAXSYMLEN]; /* Filename of word|class probs */
   Source wcSrc;             /* word|class probs/counts file */
   int nWords;               /* Number of words in total over all classes */
   char *ptr;

   if ((tgtPType&LMP_FLOAT) && (tgtPType&LMP_LOG))
      HError(15430,"LoadLangModel: Incompatible probability kind requested: %d",tgtPType);
   if (InitSource(fn,&src,LangModFilter)!=SUCCESS)   /* Open LM file */
      HError(15410,"Unable to open language model file");
   if (trace&T_LOAD) {
      printf("Loading language model from %s\n", fn); fflush(stdout);
   }
   lm = (BackOffLM *) New(heap,sizeof(BackOffLM));
   lm->heap = heap;
   lm->htab = CreateHashTable(11731,"Back-off LM hash table");
   lm->gScale = gramScale;
   lm->fe_buff = NULL;
   lm->se_buff = NULL;
   lm->binMap  = NULL;
   lm->classH = NULL;
   lm->classLM = FALSE; /* default to not a class-based LM */
   lm->classBM = NULL;
   lm->classW = 0;
#ifdef HTK_CRYPT
   lm->encrypt = (src.crypt!=NULL);
#endif
   for (gi=lm->gInfo, i=1; i<LM_NSIZE; i++,gi++) {
      gi->nEntry = 0; gi->fmt = LMF_OTHER;
      gi->aInfo = NULL; gi->boInfo = NULL;
   }
   wcSrc.f = NULL;  /* no word|class source open yet (checked before closing src below) */

   /* Have a look at the input file to see if it's a word|class count/probability
      file. If so it will link to the 'real' class language model, so load in
      these probabilities, and then continue to load the class gram counts from
      a standard language model as if it were the only original input. */

   /* Read first line from input LM file */
   GetInLine(&src, lnBuf);

   /* See if it's a multi-file class-based LM */
   if (strncmp(lnBuf, "Class-based LM", 14)==0) {
      /* Class-based LM */
      if (trace & T_LOAD) {
         printf("Loading a multi-file class-based language model\n");
      }
      /* Read filename of word|class probs/counts */
      GetInLine(&src, lnBuf);
      ptr = strchr(lnBuf, ':');
      if (!ptr)
         HError(15450, "LoadLangModel: Class language model file is in unknown format");
      ptr++;
      ptr += strspn(ptr, " \t");
      strcpy(wc_fname, ptr);
      /* Read filename of class|class bigrams */
      GetInLine(&src, lnBuf);
      ptr = strchr(lnBuf, ':');
      if (!ptr)
         HError(15450, "LoadLangModel: Class language model file is in unknown format");
      ptr++;
      ptr += strspn(ptr, " \t");
      /* NOTE: ptr content is used later on in this function to load in the class n-grams */
      /* Close input file (ignore anything left in the file) */
      CloseSource(&src);
      /* Load in word|class counts/probabilities file header */
      ReadClassProbsHeader(wc_fname, &nWords, &wcSrc, lm);
      /* This sets lm->classCounts if it reads the appropriate header;
         otherwise probabilities */
      /* Allocate hash table for words */
      lm->classH = CreateHashTable((nWords/3)+1, "LM word/classes map");
      /* Allocate space for vocabulary map for words */
      lm->classBM = (NameId *) New(lm->heap, nWords*sizeof(NameId));
      lm->classBM--;  /* indexed from 1 (this is to make it work the same way as binMap) */
      /* This is really nasty so be careful if modifying code using classBM (or binMap) */
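      /*
         Illustrative note (not in the HTK sources): the decrement above is a
         1-based indexing trick, mirroring binMap.  A hypothetical sketch of
         the same idiom:

            NameId *map = (NameId *) New(heap, n * sizeof(NameId));
            map--;              // map[1] .. map[n] now span the allocation
            map[1] = someWord;  // instead of map[0]

         Strictly, forming the decremented pointer is undefined behaviour in
         ISO C, though it behaves as intended on flat-memory platforms; any
         code releasing the block must hand back map+1, never map.
      */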
      /* This is a class-based LM (flag is toggled when backing off in GetNGramProb) */
      lm->classLM = TRUE;
      /* Store number of vocab words */
      lm->classW = nWords;
      /* We can either load probabilities or counts; counts require extra storage */
      if (lm->classCounts) {
         int j;
         /* Allocate word count storage space (totals allocated once we know #classes) */
         lm->word = New(&gcheap, nWords * sizeof(int));
         for (j=0; j<nWords; j++) {
            lm->word[j] = 0;
         }
      }
      /* Open class|class n-grams */
      if (InitSource(ptr, &src, LangModFilter)!=SUCCESS) /* ptr is n-gram file name */
         HError(15410, "LoadLangModel: Unable to open class|class n-gram language model file");
      if (trace&T_LOAD) {
         printf("Loading class n-grams from %s\n", ptr);
         fflush(stdout);
      }
      first_line = NULL; /* Read first line from class n-gram LM */
   }
   /* See if it's a single-file class LM */
   else if (strncmp(lnBuf, "CLASS MODEL", 11)==0) {
      if (trace & T_LOAD) {
         printf("Loading a class-based language model\n");
      }
      /* Load in word|class counts/probabilities header */
      wcSrc = src; /* Copy structure */
      ReadClassProbsHeader("", &nWords, &wcSrc, lm);
      /* This sets lm->classCounts if it reads the appropriate header;
         otherwise probabilities */
      /* Allocate hash table for words */
      lm->classH = CreateHashTable((nWords/3)+1, "LM word/classes map");
      /* Allocate space for vocabulary map for words */
      lm->classBM = (NameId *) New(lm->heap, nWords*sizeof(NameId));
      lm->classBM--;  /* indexed from 1 (this is to make it work the same way as binMap) */
      /* This is really nasty so be careful if modifying code using classBM (or binMap) */
      /* This is a class-based LM (flag is toggled when backing off in GetNGramProb) */
      lm->classLM = TRUE;
      /* Store number of vocab words */
      lm->classW = nWords;
      /* We can either load probabilities or counts; counts require extra storage */
      if (lm->classCounts) {
         int j;
         /* Allocate word count storage space (totals allocated once we know #classes) */
         lm->word = New(&gcheap, nWords * sizeof(int));
         for (j=0; j<nWords; j++) {
            lm->word[j] = 0;
         }
      }
      /* Open class|class n-grams */
      if (trace&T_LOAD) {
         printf("Reading class n-gram counts\n");
         fflush(stdout);
      }
      first_line = NULL; /* Read first line from current open file */
   }
   else {
      first_line = lnBuf; /* We've already read the first line */
   }

   ReadHeaderInfo(&src, lm, first_line);  /* First line of input is passed (or NULL) */
   if ((lm->probType&tgtPType)==0)
      HError(15430,"LoadLangModel: Unable to convert %d to %d pkind",
             lm->probType,tgtPType);
   lm->probType &= tgtPType;
   isUltra = FALSE;
   for (gi=lm->gInfo+1, nSize=1; nSize<LM_NSIZE; nSize++,gi++) {
      sprintf(sfmt, "ngram %d%%c%%d", nSize);
      if (GetInLine(&src,lnBuf)==NULL)
         HError(15450,"LoadLangModel: EOF whilst parsing n-gram info");
      if (sscanf(lnBuf, sfmt, &c, &n)==2) {
         if (trace&T_LOAD)
            printf("%s\n", lnBuf);
         gi->nEntry = n;
         switch (c) {
            case '=': gi->fmt = LMF_TEXT;   break;
            case '~': gi->fmt = LMF_BINARY; break;
            case '#': gi->fmt = LMF_ULTRA;  isUltra = TRUE; break;
            default :
               HError(15450,"LoadLangModel: Unknown LM file format (%s)",lnBuf);
         }
      } else
         break;
   }
   if (--nSize < 1)
      HError(15450, "LoadLangModel: Unable to identify file %s", fn);
   lm->nSize = nSize;
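   /* Illustrative note (not in the HTK sources): the loop above parses
      ARPA-style size lines from the model header.  Hypothetical examples of
      lines it accepts:

         ngram 1=65000      (text section)
         ngram 2~1234567    (binary section)
         ngram 3#1048576    (ultra section)

      sscanf matches "ngram <n><sep><count>": the separator character records
      the on-disk format of each section and <count> becomes gi->nEntry. */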
   /* initialise vocabulary size and lookup table */
   lm->vocSize = (wl==NULL) ? lm->gInfo[1].nEntry : wl->used;
   lm->binMap = (NameId *) New(lm->heap,(lm->vocSize)*sizeof(NameId));
   lm->binMap--;  /* indexed from 1 - beware if altering the code! This is really nasty! */
   if (wl!=NULL) {
      NameId wdid;

      if (isUltra)
         HError(15440,"LoadLangModel: Cannot prune models in ultra format");
      itran = (int *) New(&gstack,(lm->gInfo[1].nEntry+1)*sizeof(int));
      for (i=1; i<=lm->vocSize; i++) {
         wdid = GetNameId(lm->htab,wl->id[i-1]->name,TRUE);
         lm->binMap[i] = wdid; LM_INDEX(wdid) = -i; /* assign negative indices */
      }
   } else {
      itran = NULL;
      for (i=1; i<=(lm->vocSize); i++) lm->binMap[i] = NULL;
   }
   if ((lm->vocSize > USHRT_MAX) && (sizeof(LM_Id)==sizeof(UShort)))
      HError(15445,"LoadLangModel: Unable to load %d unigrams using %d-byte IDs",
             lm->vocSize,(int) sizeof(LM_Id));
   /* initialise auxiliary structures */
   lm->lmvec = (float *) New(lm->heap,(lm->vocSize)*sizeof(float));
   lm->lmvec--;   /* indexed from 1 (hmmmmm) */
   lm->fe_buff = (FLEntry *) New(lm->heap,(lm->vocSize)*sizeof(FLEntry));
   lm->se_buff = (SMEntry *) New(lm->heap,(lm->vocSize)*sizeof(SMEntry));
   if (isUltra) {                        /* ultra file format */
#ifdef ULTRA_LM
      char *s;
      unsigned short key[KEY_LENGTH];

      if (strstr(lnBuf,"KEY: ")==NULL)
         HError(15450,"LoadLangModel: Unable to find KEY (%s)",lnBuf);
      ultraKey[KEY_LENGTH-1] = (vaxOrder && natReadOrder) ? 1 : 0;
      for (strtok(lnBuf," "),i=0; i<KEY_LENGTH; i++) {
         if ((s=strtok(NULL," "))==NULL)
            HError(15450,"LoadLangModel: Unable to read key[%d] (%s)",i,lnBuf);
         key[i] = strtol(s,(char **)NULL,16);
         if (key[i]!=ultraKey[i])
            HError(15450,"LoadLangModel: key[%d] mismatch %02x - should be %02x\n",
                   i, key[i], ultraKey[i]);
      }
      LoadNGram(&src,1,lm,NULL);
      LoadUltraNGrams(&src,lm);
#else
      HError(15490,"LoadLangModel: Ultra format LMs not supported");
#endif
   } else {                              /* text or binary file format */
      for (i=1; i<=nSize; i++)
         lm->gInfo[i].nEntry = LoadNGram(&src,i,lm,itran);
   }
   if (itran!=NULL) Dispose(&gstack,itran);
   SyncStr(&src,"\\end\\");
   if (wcSrc.f != src.f) CloseSource(&src);
   for (i=1; i<lm->nSize; i++) {
      if (lm->gInfo[i].nEntry==0) {
         HError(-15460,"LoadLangModel: Model order changed from %d-gram to %d-gram",
                lm->nSize,i-1);
         lm->nSize = i-1; break;
      }
   }
   /* Build reverse look-up for use when recreating context from an FLEntry pointer */
   CreateReverseLookup(&(lm->root));
   if (lm->classLM) {
      if (lm->classCounts) {
         /* Load in given word|class count file(s) */
         if (trace & T_LOAD)
            printf("Loading word-in-class counts\n");
         ReadClassCounts(&wcSrc, nWords, lm);
         /* Allocate space for and count class totals for each LM */
         CountClassTotals(lm);
         /* Calculate static/initial word|class probabilities */
         CalcWordClassProbs(lm);
      }
      else {
         if (trace & T_LOAD)
            printf("Loading word-in-class probabilities\n");
         /* Load in word|class probabilities file */
         ReadClassProbs(&wcSrc, nWords, lm);
      }
      CloseSource(&wcSrc);
   }
   if (trace & T_LOAD)
      printf("Language model import complete (%d words; %s model)\n",
             lm->classW, lm->classLM ? "class" : "word");
   return lm;
}

/*------------------------- LM saving -------------------------*/
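/* Illustrative note (not in the HTK sources): from the write calls in
   WriteNGram below, each binary n-gram record appears to be laid out as

      Byte  fsize    size in bytes of the record body
      Byte  flags    HAS_BOWT and/or INT_LMID
      float prob     n-gram log probability (WriteFloat)
      ids[nSize]     context word ids then the predicted word id,
                     2-byte UShorts, or 4-byte UInts when INT_LMID is set

   When HAS_BOWT is set, fsize also counts a back-off weight float; its
   write falls beyond this page, so its position in the record is not
   visible here.  The fsize arithmetic (sizeof(float), plus an optional
   float, plus nSize ids) matches this layout. */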
/* WriteNGram: recursive write routine */
static int WriteNGram(FILE *f, BackOffLM *lm, FLEntry **feStack,
                      int g, int nSize, Boolean intId)
{
   NGramInfo *gi;
   int i,j,ndx,nItem;
   SMEntry *se;
   FLEntry *fe,*topFE;
   Byte fsize,flags;
   float prob,bowt,iScale;
   Boolean has_bowt, isBin=FALSE;
   char *s, *word, context[MAXSYMLEN];
   LMProbType ptype;

   nItem = 0; iScale = 1.0/(lm->gScale*LN10);
   ptype = lm->probType;
   if (g < nSize) {
      topFE = feStack[g-1];
      for (fe=topFE->fea, i=0; i<topFE->nfe; i++, fe++) {
         feStack[g] = fe;
         nItem += WriteNGram(f,lm,feStack,g+1,nSize,intId);
      }
   } else {
      gi = lm->gInfo+nSize;
      if (gi->fmt==LMF_BINARY || gi->fmt==LMF_TEXT)
         isBin = (gi->fmt==LMF_BINARY);
      else
         HError(15490,"WriteNGram: Unknown LM file format (%d)\n",gi->fmt);
      for (*context = '\0', s = context, j=1; j<nSize; j++) {
         ndx = feStack[j]->ndx;
         if ((ndx < 1) || (ndx > lm->vocSize))
            HError(15490,"WriteNGram: Component %d of %d-gram, FE index (%d)",
                   j,nSize,ndx);
         word = lm->binMap[ndx]->name;
         if (htkEsc)
            word = ReWriteString(word,NULL,ESCAPE_CHAR);
         sprintf(s,"%s ",word); s += strlen(s);
      }
      topFE = feStack[nSize-1];
      for (se = topFE->sea, i=0; i<topFE->nse; i++, se++) {
         if ((se->ndx < 1) || (se->ndx > lm->vocSize)) {
            HError(15490,"WriteNGram: Invalid SE index (%d)",se->ndx);
         }
         switch (ptype) {
            case LMP_FLOAT :
               prob = FLT_TO_LOG10(se->prob); break;
            case LMP_LOG :
#ifdef LM_COMPACT
               prob = Shrt2Prob(se->prob); break;
#else
               prob = se->prob * iScale; break;
#endif
            default:
               prob = se->prob; break;
         }
         if ((nSize < lm->nSize) &&
             (topFE->nfe > 0)    &&
             (fe = FindFE(topFE->fea,0,topFE->nfe,se->ndx))!=NULL) {
            /* if (fe->nse>0) { */
            has_bowt = TRUE;
            switch (ptype) {
               case LMP_FLOAT :
                  bowt = FLT_TO_LOG10(fe->bowt); break;
               case LMP_LOG :
                  bowt = fe->bowt * iScale; break;
               default:
                  bowt = fe->bowt; break;
            }
            /* } */
         } else {
            has_bowt = FALSE;
         }
         if (isBin) {
            flags = 0; fsize = sizeof(float);
            if (has_bowt) {
               flags |= HAS_BOWT; fsize += sizeof(float);
            }
            if (intId) {
               fsize += nSize*sizeof(UInt);
               flags |= INT_LMID;
            } else {
               fsize += nSize*sizeof(UShort);
            }
            fwrite(&fsize, sizeof(Byte),1,f);  /* size field */
            fwrite(&flags, sizeof(Byte),1,f);  /* flags field */
            WriteFloat(f,&prob,1,TRUE);        /* probability */
            if (flags&INT_LMID) {
               UInt x;
               for (j=1; j<nSize; j++) {
                  x = (UInt) feStack[j]->ndx;
                  WriteInt(f,(int *)&x,1,TRUE);
               }
               x = (UInt) se->ndx;
               WriteInt(f,(int *)&x,1,TRUE);
            } else {
               UShort x;
               for (j=1; j<nSize; j++) {
