⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 lm.c

📁 CMU大名鼎鼎的SPHINX-3大词汇量连续语音识别系统
💻 C
📖 第 1 页 / 共 3 页
字号:
	    lm->byteswap = 1;	else {	    SWAP_INT32(&k);	    E_FATAL("Bad magic number: %d(%08x), not an LM dumpfile??\n", k, k);	}    }    /* Read and verify standard header string */    if (fread (str, sizeof (char), k, lm->fp) != (size_t)k)	E_FATAL("fread(%s) failed\n", file);    if (strncmp (str, darpa_hdr, k) != 0)	E_FATAL("Bad header\n");    /* Original LM filename string size and string */    k = lm_fread_int32 (lm);    if ((k < 1) || (k > 1024))	E_FATAL("Bad original filename size: %d\n", k);    if (fread (str, sizeof (char), k, lm->fp) != (size_t)k)	E_FATAL("fread(%s) failed\n", file);    /* Version#.  If present (must be <= 0); otherwise it's actually the unigram count */    vn = lm_fread_int32 (lm);    if (vn <= 0) {	/* Read and skip orginal file timestamp; (later compare timestamps) */	k = lm_fread_int32 (lm);	/* Read and skip format description */	for (;;) {	    if ((k = lm_fread_int32 (lm)) == 0)		break;	    if (fread (str, sizeof(char), k, lm->fp) != (size_t)k)		E_FATAL("fread(%s) failed\n", file);	}	/* Read log_bg_seg_sz if present */	if (vn <= -2) {	    k = lm_fread_int32 (lm);	    if ((k < 1) || (k > 15))		E_FATAL("log2(bg_seg_sz) outside range 1..15\n", k);	    lm->log_bg_seg_sz = k;	} else	    lm->log_bg_seg_sz = LOG2_BG_SEG_SZ;	/* Default */	/* Read #ug */	lm->n_ug = lm_fread_int32 (lm);    } else {	/* No version number, actually a unigram count */	lm->n_ug = vn;	lm->log_bg_seg_sz = LOG2_BG_SEG_SZ;	/* Default */    }    if ((lm->n_ug <= 0) || (lm->n_ug >= MAX_S3LMWID))	E_FATAL("Bad #ug: %d (must be >0, <%d\n", lm->n_ug, MAX_S3LMWID);    lm->bg_seg_sz = 1 << lm->log_bg_seg_sz;    /* #bigrams */    lm->n_bg = lm_fread_int32 (lm);    if (lm->n_bg < 0)	E_FATAL("Bad #bigrams: %d\n", lm->n_bg);    /* #trigrams */    lm->n_tg = lm_fread_int32 (lm);    if (lm->n_tg < 0)	E_FATAL("Bad #trigrams: %d\n", lm->n_tg);    /* Read ug; remember sentinel ug at the end! */    lm->ug = (ug_t *) ckd_calloc (lm->n_ug+1, sizeof(ug_t));    if (fread (lm->ug, sizeof(ug_t), lm->n_ug+1, lm->fp) != (size_t)(lm->n_ug+1))	E_FATAL("fread(%s) failed\n", file);    if (lm->byteswap)	for (i = 0; i <= lm->n_ug; i++) {	    SWAP_INT32(&(lm->ug[i].prob.l));	    SWAP_INT32(&(lm->ug[i].bowt.l));	    SWAP_INT32(&(lm->ug[i].firstbg));	}    E_INFO("%8d ug\n", lm->n_ug);    /* RAH, 5.1.01 - Let's try reading the whole damn thing in here   */    if (isLM_IN_MEMORY) {      lm->bg = (bg_t *) ckd_calloc (lm->n_bg+1,sizeof(bg_t));      lm->tg = (tg_t *) ckd_calloc (lm->n_tg+1,sizeof(tg_t));      if (lm->n_bg > 0) {       /* Read bigrams; remember sentinel at the end */	lm->bgoff = ftell (lm->fp);	fread (lm->bg, lm->n_bg+1,sizeof(bg_t),lm->fp);	E_INFO("Read %8d bigrams [in memory]\n", lm->n_bg);		lm->membg = (membg_t *) ckd_calloc (lm->n_ug, sizeof(membg_t));      }            if (lm->n_tg > 0) {       /* Read trigrams */	lm->tgoff = ftell (lm->fp);	fread (lm->tg,lm->n_tg,sizeof(tg_t),lm->fp);	E_INFO("Read %8d trigrams [in memory]\n", lm->n_tg);		lm->tginfo = (tginfo_t **) ckd_calloc (lm->n_ug, sizeof(tginfo_t *));      }    } else {      lm->bg = NULL;      lm->tg = NULL;            /* Skip bigrams; remember sentinel at the end */      if (lm->n_bg > 0) {	lm->bgoff = ftell (lm->fp);	fseek (lm->fp, (lm->n_bg+1) * sizeof(bg_t), SEEK_CUR);	E_INFO("%8d bigrams [on disk]\n", lm->n_bg);	lm->membg = (membg_t *) ckd_calloc (lm->n_ug, sizeof(membg_t));      }            /* Skip trigrams */      if (lm->n_tg > 0) {	lm->tgoff = ftell (lm->fp);	fseek (lm->fp, lm->n_tg * sizeof(tg_t), SEEK_CUR);	E_INFO("%8d trigrams [on disk]\n", lm->n_tg);		lm->tginfo = (tginfo_t **) ckd_calloc (lm->n_ug, sizeof(tginfo_t *));      }    }        if (lm->n_bg > 0) {	/* Bigram probs table size */	lm->n_bgprob = lm_fread_int32 (lm);	if ((lm->n_bgprob <= 0) || (lm->n_bgprob > 65536))	    E_FATAL("Bad bigram prob table size: %d\n", lm->n_bgprob);		/* Allocate and read bigram probs table */	lm->bgprob = (lmlog_t *) ckd_calloc (lm->n_bgprob, sizeof (lmlog_t));	if (fread(lm->bgprob, sizeof(lmlog_t), lm->n_bgprob, lm->fp) !=	    (size_t)lm->n_bgprob)	    E_FATAL("fread(%s) failed\n", file);	if (lm->byteswap) {	    for (i = 0; i < lm->n_bgprob; i++)		SWAP_INT32(&(lm->bgprob[i].l));	}	E_INFO("%8d bigram prob entries\n", lm->n_bgprob);    }    if (lm->n_tg > 0) {	/* Trigram bowt table size */	lm->n_tgbowt = lm_fread_int32 (lm);	if ((lm->n_tgbowt <= 0) || (lm->n_tgbowt > 65536))	  E_FATAL("Bad trigram bowt table size: %d\n", lm->n_tgbowt);		/* Allocate and read trigram bowt table */	lm->tgbowt = (lmlog_t *) ckd_calloc (lm->n_tgbowt, sizeof (lmlog_t));	if (fread (lm->tgbowt, sizeof (lmlog_t), lm->n_tgbowt, lm->fp) !=	    (size_t)lm->n_tgbowt)	  E_FATAL("fread(%s) failed\n", file);	if (lm->byteswap) {	  for (i = 0; i < lm->n_tgbowt; i++)	    SWAP_INT32(&(lm->tgbowt[i].l));	}	E_INFO("%8d trigram bowt entries\n", lm->n_tgbowt);	/* Trigram prob table size */	lm->n_tgprob = lm_fread_int32 (lm);	if ((lm->n_tgprob <= 0) || (lm->n_tgprob > 65536))	  E_FATAL("Bad trigram bowt table size: %d\n", lm->n_tgprob);		/* Allocate and read trigram bowt table */	lm->tgprob = (lmlog_t *) ckd_calloc (lm->n_tgprob, sizeof (lmlog_t));	if (fread (lm->tgprob, sizeof (lmlog_t), lm->n_tgprob, lm->fp) !=	    (size_t)lm->n_tgprob)	  E_FATAL("fread(%s) failed\n", file);	if (lm->byteswap) {	  for (i = 0; i < lm->n_tgprob; i++)	    SWAP_INT32(&(lm->tgprob[i].l));	}	E_INFO("%8d trigram prob entries\n", lm->n_tgprob);	/* Trigram seg table size */	k = lm_fread_int32 (lm);	if (k != (lm->n_bg+1)/lm->bg_seg_sz+1)	    E_FATAL("Bad trigram seg table size: %d\n", k);		/* Allocate and read trigram seg table */	lm->tg_segbase = (int32 *) ckd_calloc (k, sizeof(int32));	if (fread (lm->tg_segbase, sizeof(int32), k, lm->fp) != (size_t)k)	    E_FATAL("fread(%s) failed\n", file);	if (lm->byteswap) {	    for (i = 0; i < k; i++)		SWAP_INT32(&(lm->tg_segbase[i]));	}	E_INFO("%8d trigram segtable entries (%d segsize)\n", k, lm->bg_seg_sz);    }    /* Read word string names */    k = lm_fread_int32 (lm);    if (k <= 0)	E_FATAL("Bad wordstrings size: %d\n", k);        tmp_word_str = (char *) ckd_calloc (k, sizeof (char));    if (fread (tmp_word_str, sizeof(char), k, lm->fp) != (size_t)k)	E_FATAL("fread(%s) failed\n", file);    /* First make sure string just read contains n_ug words (PARANOIA!!) */    for (i = 0, j = 0; i < k; i++)	if (tmp_word_str[i] == '\0')	    j++;    if (j != lm->n_ug)	E_FATAL("Bad #words: %d\n", j);    /* Break up string just read into words */    startwid = endwid = BAD_S3LMWID;    lm->wordstr = (char **) ckd_calloc (lm->n_ug, sizeof(char *));    j = 0;    for (i = 0; i < lm->n_ug; i++) {	if (strcmp (tmp_word_str+j, S3_START_WORD) == 0)	    startwid = i;	else if (strcmp (tmp_word_str+j, S3_FINISH_WORD) == 0)	    endwid = i;	lm->wordstr[i] = (char *) ckd_salloc (tmp_word_str+j);		j += strlen(tmp_word_str+j) + 1;    }    free (tmp_word_str);    E_INFO("%8d word strings\n", i);        /* Force ugprob(<s>) = MIN_PROB_F */    if (IS_S3LMWID(startwid)) {	lm->ug[startwid].prob.f = MIN_PROB_F;	lm->startlwid = startwid;    }        /* Force bowt(</s>) = MIN_PROB_F */    if (IS_S3LMWID(endwid)) {	lm->ug[endwid].bowt.f = MIN_PROB_F;	lm->finishlwid = endwid;    }    if(n_lmclass_used>0) {      lm_build_lmclass_info(lm,lw,uw,wip,n_lmclass_used,lmclass);    }    lm2logs3 (lm, uw);	/* Applying unigram weight; convert to logs3 values */        /* Apply the new lw and wip values */    lm->lw = 1.0;	/* The initial settings for lw and wip */    lm->wip = 0;	/* logs3(1.0) */    lm_set_param (lm, lw, wip);    return lm;}lm_t *lm_read (char *file, float64 lw, float64 wip, float64 uw){    int32 i, u;    lm_t *lm;    int32 isLM_IN_MEMORY=0;          if (! file)	E_FATAL("No LM file\n");    if (lw <= 0.0)	E_FATAL("lw = %e\n", lw);    if (wip <= 0.0)	E_FATAL("wip = %e\n", wip);    if ((uw < 0.0) || (uw > 1.0))	E_FATAL("uw = %e\n", uw);        E_INFO ("LM read('%s', lw= %.2f, wip= %d, uw= %.2f)\n", file, lw, logs3(wip), uw);    if (cmd_ln_int32 ("-lminmemory"))       isLM_IN_MEMORY = 1;        else      isLM_IN_MEMORY = 0;        /* For now, only dump files can be read; they are created offline */    lm = lm_read_dump (file, lw, wip, uw,0,NULL,0);    for (u = 0; u < lm->n_ug; u++)	lm->ug[u].dictwid = BAD_S3WID;        /* Initialize the fast trigram cache, with all entries invalid */    lm->tgcache = (lm_tgcache_entry_t *) ckd_calloc(LM_TGCACHE_SIZE, sizeof(lm_tgcache_entry_t));    for (i = 0; i < LM_TGCACHE_SIZE; i++)	lm->tgcache[i].lwid[0] = BAD_S3LMWID;        return lm;}/* * Free stale bigram and trigram info, those not used since last reset. */void lm_cache_reset (lm_t *lm){    int32 i, n_bgfree, n_tgfree;    tginfo_t *tginfo, *next_tginfo, *prev_tginfo;    int32 isLM_IN_MEMORY=0;    n_bgfree = n_tgfree = 0;        if (cmd_ln_int32 ("-lminmemory"))       isLM_IN_MEMORY = 1;        else      isLM_IN_MEMORY = 0;    /* ARCHAN: RAH only short-circult this function only */    if (isLM_IN_MEMORY)		/* RAH We are going to short circuit this if we are running with the lm in memory */    return;      if ((lm->n_bg > 0) && (! lm->bg)) {	/* Disk-based; free "stale" bigrams */	for (i = 0; i < lm->n_ug; i++) {	    if (lm->membg[i].bg && (! lm->membg[i].used)) {		lm->n_bg_inmem -= lm->ug[i+1].firstbg - lm->ug[i].firstbg;		free (lm->membg[i].bg);		lm->membg[i].bg = NULL;		n_bgfree++;	    }	    lm->membg[i].used = 0;	}    }        if (lm->n_tg > 0) {	for (i = 0; i < lm->n_ug; i++) {	    prev_tginfo = NULL;	    for (tginfo = lm->tginfo[i]; tginfo; tginfo = next_tginfo) {		next_tginfo = tginfo->next;				if (! tginfo->used) {		    if ((! lm->tg) && tginfo->tg) {			lm->n_tg_inmem -= tginfo->n_tg;			free (tginfo->tg);			n_tgfree++;		    }		    		    free (tginfo);		    if (prev_tginfo)			prev_tginfo->next = next_tginfo;		    else			lm->tginfo[i] = next_tginfo;		} else {		    tginfo->used = 0;		    prev_tginfo = tginfo;		}	    }	}    }    if ((n_tgfree > 0) || (n_bgfree > 0)) {	E_INFO("%d tg frees, %d in mem; %d bg frees, %d in mem\n",	       n_tgfree, lm->n_tg_inmem, n_bgfree, lm->n_bg_inmem);    }}void lm_cache_stats_dump (lm_t *lm){    E_INFO("%9d tg(), %9d tgcache, %8d bo; %5d fills, %8d in mem (%.1f%%)\n",	   lm->n_tg_score, lm->n_tgcache_hit, lm->n_tg_bo, lm->n_tg_fill, lm->n_tg_inmem,	   (lm->n_tg_inmem*100.0)/(lm->n_tg+1));    E_INFO("%8d bg(), %8d bo; %5d fills, %8d in mem (%.1f%%)\n",	   lm->n_bg_score, lm->n_bg_bo, lm->n_bg_fill, lm->n_bg_inmem,	   (lm->n_bg_inmem*100.0)/(lm->n_bg+1));        lm->n_tgcache_hit = 0;    lm->n_tg_fill = 0;    lm->n_tg_score = 0;    lm->n_tg_bo = 0;    lm->n_bg_fill = 0;    lm->n_bg_score = 0;    lm->n_bg_bo = 0;}int32 lm_ug_score (lm_t *lm, s3lmwid_t lwid, s3wid_t wid){    if (NOT_S3LMWID(lwid) || (lwid >= lm->n_ug))	E_FATAL("Bad argument (%d) to lm_ug_score\n", lwid);    lm->access_type = 1;        if(lm->inclass_ugscore)      return (lm->ug[lwid].prob.l +lm->inclass_ugscore[wid]);    else      return (lm->ug[lwid].prob.l );}int32 lm_uglist (lm_t *lm, ug_t **ugptr){    *ugptr = lm->ug;    return (lm->n_ug);}/* This create a mapping from either the unigram or words in a class*/int32 lm_ug_wordprob (lm_t *lm, dict_t *dict,int32 th, wordprob_t *wp){    int32 i, j, n, p;    s3wid_t w,dictid;    lmclass_t lmclass;    lmclass_word_t lm_cw;    n = lm->n_ug;        for (i = 0, j = 0; i < n; i++) {	w = lm->ug[i].dictwid;	if (IS_S3WID(w)) { /*Is w>0? Then it can be either wid or class id*/	  if (w <  LM_CLASSID_BASE){ /*It is just a word*/	    if ((p = lm->ug[i].prob.l) >= th) {		wp[j].wid = w;		wp[j].prob = p;		j++;	    }	  }else{ /* It is a class */	    lmclass=LM_CLASSID_TO_CLASS(lm,w); /* Get the class*/	    lm_cw=lmclass_firstword(lmclass);	    while(lmclass_isword(lm_cw)){	      dictid =lmclass_getwid(lm_cw); 	      /*E_INFO("Lookup dict_id using dict_basewid %d\n",dictid);*/	      if(IS_S3WID(dictid)){		if(dictid !=dict_basewid(dict,dictid)){		  dictid=dict_basewid(dict,dictid);		}		if((p=lm->ug[i].prob.l+lm->inclass_ugscore[dictid])>=th){		  wp[j].wid=dictid;		  wp[j].prob=lm->ug[i].prob.l;		  j++;		}	      }else{		E_INFO("Word %s cannot be found \n", lmclass_getword(lm_cw));	      }	      lm_cw= lmclass_nextword (lmclass,lm_cw);	      	    }	  }	}    }        return j;}/* * Load bigrams for the given unigram (LMWID) lw1 from disk into memory */static void load_bg (lm_t *lm, s3lmwid_t lw1){    int32 i, n, b;    bg_t *bg;    int32 isLM_IN_MEMORY=0;        b = lm->ug[lw1].firstbg;		/* Absolute first bg index for ug lw1 */    n = lm->ug[lw1+1].firstbg - b;	/* Not including guard/sentinel */    if (cmd_ln_int32 ("-lminmemory"))       isLM_IN_MEMORY = 1;        else      isLM_IN_MEMORY = 0;      if (isLM_IN_MEMORY)		/* RAH, if LM_IN_MEMORY, then we don't need to go get it. */    bg = lm->membg[lw1].bg = &lm->bg[b];  else {    bg = lm->membg[lw1].bg = (bg_t *) ckd_calloc (n+1, sizeof(bg_t));        if (fseek (lm->fp, lm->bgoff + b*sizeof(bg_t), SEEK_SET) < 0)	E_FATAL_SYSTEM ("fseek failed\n");        /* Need to read n+1 because obtaining tg count for one bg also depends on next bg */    if (fread (bg, sizeof(bg_t), n+1, lm->fp) != (size_t)(n+1))	E_FATAL("fread failed\n");    if (lm->byteswap) {	for (i = 0; i <= n; i++) {	    SWAP_INT16(&(bg[i].wid));	    SWAP_INT16(&(bg[i].probid));	    SWAP_INT16(&(bg[i].bowtid));	    SWAP_INT16(&(bg[i].firsttg));	}    }  }    lm->n_bg_fill++;    lm->n_bg_inmem += n;}#define BINARY_SEARCH_THRESH	16

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -