📄 lm.c

📁 CMU大名鼎鼎的SPHINX－3大词汇量连续语音识别系统
💻 C
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/* ==================================================================== * Copyright (c) 1999-2004 Carnegie Mellon University.  All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright *    notice, this list of conditions and the following disclaimer.  * * 2. Redistributions in binary form must reproduce the above copyright *    notice, this list of conditions and the following disclaimer in *    the documentation and/or other materials provided with the *    distribution. * * This work was supported in part by funding from the Defense Advanced  * Research Projects Agency and the National Science Foundation of the  * United States of America, and the CMU Sphinx Speech Consortium. * * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * ==================================================================== * *//* * lm.c -- Disk-based backoff word trigram LM module. * * ********************************************** * CMU ARPA Speech Project * * Copyright (c) 1997 Carnegie Mellon University. * ALL RIGHTS RESERVED. * ********************************************** *  * HISTORY *  * 20.Apr.2001  RAH (rhoughton@mediasite.com, ricky.houghton@cs.cmu.edu) *              Adding lm_free() to free allocated memory *  * 30-Dec-2000  Rita Singh (rsingh@cs.cmu.edu) at Carnegie Mellon University *		Removed language weight application to wip. To maintain *		comparability between s3decode and current decoder. Does *		not affect decoding performance. * * 23-Feb-2000	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * 		Bugfix: Applied language weight to word insertion penalty. *  * 24-Jun-97	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * 		Added lm_t.access_type; made lm_wid externally visible. *  * 24-Jun-97	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * 		Added lm_t.log_bg_seg_sz and lm_t.bg_seg_sz. *  * 13-Feb-97	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University. * 		Creating from original S3 version. */#include "lm.h"#include "bio.h"#include "logs3.h"const char *darpa_hdr = "Darpa Trigram LM";/*ARCHAN, 20041112: NOP, NO STATIC VARIABLES! */static lm_t *lm_read_dump (char *file, float64 lw, float64 wip, float64 uw,int32 n_lmclass_used,lmclass_t *lmclass,int32 dict_size);int32 lm_get_classid (lm_t *model, char *name){    int32 i;        if (! model->lmclass)	return BAD_LMCLASSID;        for (i = 0; i < model->n_lmclass; i++) {	if (strcmp (lmclass_getname(model->lmclass[i]), name) == 0)	    return (i + LM_CLASSID_BASE);    }    return BAD_LMCLASSID;}int32 lm_delete (lm_t *lm,lmset_t *lmset){#if 0    int32 i;    tginfo_t *tginfo, *next_tginfo;        if (lm->fp)	fclose (lm->fp);        free (lm->ug);    if (lm->n_bg > 0) {	if (lm->bg)		/* Memory-based; free all bg */	    free (lm->bg);	else {		/* Disk-based; free in-memory bg */	  for (i = 0; i < lm->n_ug; i++)		if (lm->membg[i].bg)		    free (lm->membg[i].bg);	    free (lm->membg);	}	free (lm->bgprob);    }        if (lm->n_tg > 0) {	if (lm->tg)		/* Memory-based; free all tg */	    free (lm->tg);	for (i = 0; i < lm->n_ug; i++) {	/* Free cached tg access info */	    for (tginfo = lm->tginfo[i]; tginfo; tginfo = next_tginfo) {		next_tginfo = tginfo->next;		if ((! lm->tg) && tginfo->tg)	/* Disk-based; free in-memory tg */		    free (tginfo->tg);		free (tginfo);	    }	}	free (lm->tginfo);	free (lm->tgprob);	free (lm->tgbowt);	free (lm->tg_segbase);    }        for (i = 0; i < lm->n_ug; i++)	free (lm->wordstr[i]);    free (lm->wordstr);        free (lm);    free (lmset[i].name);        for (; i < n_lm-1; i++)	lmset[i] = lmset[i+1];    --n_lm;    E_INFO("LM(\"%s\") deleted\n", name);#endif            E_INFO("Warning, lm_delete is currently empty, no memory is deleted\n");    return (0);}/* Apply unigram weight; should be part of LM creation, but... */static void lm_uw (lm_t *lm, float64 uw){    int32 i, loguw, loguw_, loguniform, p1, p2;    /* Interpolate unigram probs with uniform PDF, with weight uw */    loguw = logs3 (uw);    loguw_ = logs3 (1.0 - uw);    loguniform = logs3 (1.0/(lm->n_ug-1));	/* Skipping S3_START_WORD */        for (i = 0; i < lm->n_ug; i++) {	if (strcmp (lm->wordstr[i], S3_START_WORD) != 0) {	    p1 = lm->ug[i].prob.l + loguw;	    p2 = loguniform + loguw_;	    lm->ug[i].prob.l = logs3_add (p1, p2);	}    }}static void lm2logs3 (lm_t *lm, float64 uw){    int32 i;    for (i = 0; i < lm->n_ug; i++) {	lm->ug[i].prob.l = log10_to_logs3 (lm->ug[i].prob.f);	lm->ug[i].bowt.l = log10_to_logs3 (lm->ug[i].bowt.f);    }        lm_uw (lm, uw);        for (i = 0; i < lm->n_bgprob; i++)	lm->bgprob[i].l = log10_to_logs3 (lm->bgprob[i].f);    if (lm->n_tg > 0) {	for (i = 0; i < lm->n_tgprob; i++)	    lm->tgprob[i].l = log10_to_logs3 (lm->tgprob[i].f);	for (i = 0; i < lm->n_tgbowt; i++)	    lm->tgbowt[i].l = log10_to_logs3 (lm->tgbowt[i].f);    }}void lm_set_param (lm_t *lm, float64 lw, float64 wip){    int32 i, iwip;    float64 f;        if (lw <= 0.0)	E_FATAL("lw = %e\n", lw);    if (wip <= 0.0)	E_FATAL("wip = %e\n", wip);#if 0 /* No lang weight on wip */    iwip = logs3(wip) * lw; #endif    iwip = logs3(wip);        f = lw / lm->lw;        for (i = 0; i < lm->n_ug; i++) {	lm->ug[i].prob.l = (int32)((lm->ug[i].prob.l - lm->wip) * f) + iwip;	lm->ug[i].bowt.l = (int32)(lm->ug[i].bowt.l * f);    }    for (i = 0; i < lm->n_bgprob; i++)	lm->bgprob[i].l = (int32)((lm->bgprob[i].l - lm->wip) * f) + iwip;    if (lm->n_tg > 0) {	for (i = 0; i < lm->n_tgprob; i++)	    lm->tgprob[i].l = (int32)((lm->tgprob[i].l - lm->wip) * f) + iwip;	for (i = 0; i < lm->n_tgbowt; i++)	    lm->tgbowt[i].l = (int32)(lm->tgbowt[i].l * f);    }    lm->lw = (float32) lw;    lm->wip = iwip;}static int32 lm_fread_int32 (lm_t *lm){    int32 val;        if (fread (&val, sizeof(int32), 1, lm->fp) != 1)	E_FATAL("fread failed\n");    if (lm->byteswap)	SWAP_INT32(&val);    return (val);}/* read in the LM control structure *//* 20040218 Arthur: This function is largely copied from Sphinx 2 because I don't want *  to spend too much time in writing file reading routine.  * I attached the comment in Sphinx 2 here.  It specifies the restriction of the Darpa file format.  **//* * Read control file describing multiple LMs, if specified. * File format (optional stuff is indicated by enclosing in []): *  *   [{ LMClassFileName LMClassFilename ... }] *   TrigramLMFileName LMName [{ LMClassName LMClassName ... }] *   TrigramLMFileName LMName [{ LMClassName LMClassName ... }] *   ... * (There should be whitespace around the { and } delimiters.) *  * This is an extension of the older format that had only TrigramLMFilenName * and LMName pairs.  The new format allows a set of LMClass files to be read * in and referred to by the trigram LMs.  (Incidentally, if one wants to use * LM classes in a trigram LM, one MUST use the -lmctlfn flag.  It is not * possible to read in a class-based trigram LM using the -lmfn flag.) *  * ARCHAN,  */lmset_t* lm_read_ctl(char *ctlfile,dict_t* dict,float64 lw, float64 wip, float64 uw,char *lmdumpdir,int32* n_lm, int32* n_alloclm,int32 dict_size){  FILE *ctlfp;  FILE *tmp;  char lmfile[4096], lmname[4096], str[4096];
  int32 isLM_IN_MEMORY;
  lmclass_set_t lmclass_set;  lmclass_t *lmclass, cl;  int32 n_lmclass, n_lmclass_used;  int32 i;  lm_t *lm;  lmset_t *lmset=NULL;  tmp=NULL;
  isLM_IN_MEMORY=0;  lmclass_set = lmclass_newset();	      E_INFO("Reading LM control file '%s'\n",ctlfile);  if (cmd_ln_int32 ("-lminmemory"))     isLM_IN_MEMORY = 1;      else    isLM_IN_MEMORY = 0;	      ctlfp = myfopen (ctlfile, "r");  if (fscanf (ctlfp, "%s", str) == 1) {    if (strcmp (str, "{") == 0) {      /* Load LMclass files */      while ((fscanf (ctlfp, "%s", str) == 1) && (strcmp (str, "}") != 0))	lmclass_set = lmclass_loadfile (lmclass_set, str);		          if (strcmp (str, "}") != 0)	E_FATAL("Unexpected EOF(%s)\n", ctlfile);		          if (fscanf (ctlfp, "%s", str) != 1)	str[0] = '\0';    }  } else    str[0] = '\0';	#if 0  tmp=myfopen("./tmp","w");  lmclass_set_dump(lmclass_set,tmp);  fclose(tmp);		   #endif  /* Fill in dictionary word id information for each LMclass word */  for (cl = lmclass_firstclass(lmclass_set);       lmclass_isclass(cl);       cl = lmclass_nextclass(lmclass_set, cl)) {        /*      For every words in the class, set the dictwid correctly       The following piece of code replace s2's kb_init_lmclass_dictwid (cl);      doesn't do any checking even the id is a bad dict id.       This only sets the information in the lmclass_set, but not       lm-2-dict or dict-2-lm map.  In Sphinx 3, they are done in       wid_dict_lm_map in wid.c.     */        lmclass_word_t w;    int32 wid;    for (w = lmclass_firstword(cl); lmclass_isword(w); w = lmclass_nextword(cl, w)) {      wid = dict_wordid (dict,lmclass_getword(w));#if 0      E_INFO("In class %s, Word %s, wid %d\n",cl->name,lmclass_getword(w),wid);#endif      lmclass_set_dictwid (w, wid);    }  }  /* At this point if str[0] != '\0', we have an LM filename */  n_lmclass = lmclass_get_nclass(lmclass_set);  lmclass = (lmclass_t *) ckd_calloc (n_lmclass, sizeof(lmclass_t));  E_INFO("Number of LM class specified %d in file %s\n",n_lmclass,ctlfile);  /* Read in one LM at a time */  while (str[0] != '\0') {    strcpy (lmfile, str);    if (fscanf (ctlfp, "%s", lmname) != 1)      E_FATAL("LMname missing after LMFileName '%s'\n", lmfile);        n_lmclass_used = 0;		    if (fscanf (ctlfp, "%s", str) == 1) {      if (strcmp (str, "{") == 0) {	while ((fscanf (ctlfp, "%s", str) == 1) &&	       (strcmp (str, "}") != 0)) {	  if (n_lmclass_used >= n_lmclass){	    E_FATAL("Too many LM classes specified for '%s'\n",		    lmfile);	  }	  lmclass[n_lmclass_used] = lmclass_get_lmclass (lmclass_set,							 str);	  if (! (lmclass_isclass(lmclass[n_lmclass_used])))	    E_FATAL("LM class '%s' not found\n", str);	  n_lmclass_used++;	}	if (strcmp (str, "}") != 0)	  E_FATAL("Unexpected EOF(%s)\n", ctlfile);	if (fscanf (ctlfp, "%s", str) != 1)	  str[0] = '\0';      }    } else      str[0] = '\0';		    if (n_lmclass_used > 0){      /*ARCHAN DON'T do txt reading for a moment, just try to 	read the dmp file, bypass it for a moment. 	lm_read_clm(lmfile, lmname,		  lw,uw,wip,		  lmclass, 		  lmset,		  dict,		  n_lmclass_used,		  lmdumpdir);*/      lm = (lm_t*) lm_read_dump (lmfile, lw, wip, uw, n_lmclass_used,lmclass,dict_size);      /* Initialize the fast trigram cache, with all entries invalid */      lm->tgcache = (lm_tgcache_entry_t *) ckd_calloc(LM_TGCACHE_SIZE, sizeof(lm_tgcache_entry_t));      for (i = 0; i < LM_TGCACHE_SIZE; i++)	lm->tgcache[i].lwid[0] = BAD_S3LMWID;    }    else{      /*Again, bypass this currently, 	lm_read_txt(lmfile, lmname,lw,uw,wip,lmset,dict,lmdumpdir);*/      lm = (lm_t*) lm_read_dump (lmfile, lw, wip, uw,0,NULL,dict_size);      /* Initialize the fast trigram cache, with all entries invalid */      lm->tgcache = (lm_tgcache_entry_t *) ckd_calloc(LM_TGCACHE_SIZE, sizeof(lm_tgcache_entry_t));      for (i = 0; i < LM_TGCACHE_SIZE; i++)	lm->tgcache[i].lwid[0] = BAD_S3LMWID;    }    if(*n_lm == *n_alloclm){      lmset= (lmset_t *) ckd_realloc(lmset,(*n_alloclm+16)*sizeof(lmset_t));      *n_alloclm+=16;    }    lmset[*n_lm].name = ckd_salloc(lmname);    lmset[*n_lm].lm=lm;    *n_lm+=1;  }    E_INFO("No. of LM set allocated %d, no. of LM %d \n",*n_alloclm,*n_lm);  fclose (ctlfp);  return lmset;}static int32 lm_build_lmclass_info(lm_t *lm,float64 lw, float64 uw, float64 wip,int32 n_lmclass_used,lmclass_t *lmclass){  int i;  if(n_lmclass_used >0){    lm->lmclass=(lmclass_t*) ckd_calloc(n_lmclass_used,sizeof(lmclass_t));    for(i=0; i<n_lmclass_used ;i++)      lm->lmclass[i]=lmclass[i];  }else    lm->lmclass= NULL;  lm->n_lmclass = n_lmclass_used;  lm->inclass_ugscore = (int32*)ckd_calloc(lm->dict_size,sizeof(int32));  E_INFO("LM->inclass_ugscore size %d\n",lm->dict_size);  E_INFO("Number of class used %d\n",n_lmclass_used);  return 1;}/* * Read LM dump (<lmname>.DMP) file and make it the current LM. * Same interface as lm_read except that the filename refers to a .DMP file. */static lm_t *lm_read_dump (char *file, float64 lw, float64 wip, float64 uw,int32 n_lmclass_used, lmclass_t *lmclass,int32 dict_size){    lm_t *lm;    int32 i, j, k, vn;    char str[1024];    char *tmp_word_str;    s3lmwid_t startwid, endwid;    int32 isLM_IN_MEMORY=0;    if (cmd_ln_int32 ("-lminmemory"))       isLM_IN_MEMORY = 1;        else      isLM_IN_MEMORY = 0;    lm = (lm_t *) ckd_calloc (1, sizeof(lm_t));        lm->dict_size=dict_size;    if ((lm->fp = fopen (file, "rb")) == NULL)	E_FATAL_SYSTEM("fopen(%s,rb) failed\n", file);        /* Standard header string-size; set byteswap flag based on this */    if (fread (&k, sizeof(int32), 1, lm->fp) != 1)	E_FATAL("fread(%s) failed\n", file);    if ((size_t)k == strlen(darpa_hdr)+1)	lm->byteswap = 0;    else {	SWAP_INT32(&k);	if ((size_t)k == strlen(darpa_hdr)+1)
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -