📄 lm.c

📁 CMU大名鼎鼎的SPHINX－3大词汇量连续语音识别系统
💻 C
📖 第 1 页 / 共 3 页
字号:
上一页 1 23
/* Locate a specific bigram within a bigram list */static int32 find_bg (bg_t *bg, int32 n, s3lmwid_t w){    int32 i, b, e;        /* Binary search until segment size < threshold */    b = 0;    e = n;    while (e-b > BINARY_SEARCH_THRESH) {	i = (b+e)>>1;	if (bg[i].wid < w)	    b = i+1;	else if (bg[i].wid > w)	    e = i;	else	    return i;    }    /* Linear search within narrowed segment */    for (i = b; (i < e) && (bg[i].wid != w); i++);    return ((i < e) ? i : -1);}int32 lm_bglist (lm_t *lm, s3lmwid_t w1, bg_t **bgptr, int32 *bowt){    int32 n;    if (NOT_S3LMWID(w1) || (w1 >= lm->n_ug))	E_FATAL("Bad w1 argument (%d) to lm_bglist\n", w1);    n = (lm->n_bg > 0) ? lm->ug[w1+1].firstbg - lm->ug[w1].firstbg : 0;        if (n > 0) {	if (! lm->membg[w1].bg)	    load_bg (lm, w1);	lm->membg[w1].used = 1;	*bgptr = lm->membg[w1].bg;	*bowt = lm->ug[w1].bowt.l;    } else {	*bgptr = NULL;	*bowt = 0;    }        return (n);}#if 0 /*Obsolete, not used */int32 lm_bg_wordprob (lm_t *lm, s3lmwid_t lwid, int32 th, wordprob_t *wp, int32 *bowt){    bg_t *bgptr;    int32 i, j, n, ugprob, bgprob;    s3wid_t w;        n = lm_bglist (lm, lwid, &bgptr, bowt);    ugprob = lm_ug_score (lm, lwid);        /* Convert bglist to wordprob */    for (i = 0, j = 0; i < n; i++, bgptr++) {	w = lm->ug[bgptr->wid].dictwid;	if (IS_S3WID (w)) {	    bgprob = LM_BGPROB(lm, bgptr);	    	    if (ugprob + bgprob >= th) {	/* ABSOLUTE prob (count) >= min thresh */		wp[j].wid = w;		wp[j].prob = bgprob;		j++;	    }	}    }        return j;}#endif/* *  This function look-ups the bigram score of p(lw2|lw1) *  The information for lw2 and w2 are repeated because the legacy  *  implementation(since s3.2) of vithist used only LM wid rather  *  than dictionary wid.   */int32 lm_bg_score (lm_t *lm, s3lmwid_t lw1, s3lmwid_t lw2, s3wid_t w2){    int32 i, n, score;    bg_t *bg=0;    if ((lm->n_bg == 0) || (NOT_S3LMWID(lw1)))	return (lm_ug_score (lm, lw2, w2));    lm->n_bg_score++;    if (NOT_S3LMWID(lw2) || (lw2 >= lm->n_ug))	E_FATAL("Bad lw2 argument (%d) to lm_bg_score\n", lw2);        n = lm->ug[lw1+1].firstbg - lm->ug[lw1].firstbg;        if (n > 0) {	if (! lm->membg[lw1].bg)	    load_bg (lm, lw1);	lm->membg[lw1].used = 1;	bg = lm->membg[lw1].bg;	i = find_bg (bg, n, lw2);    } else	i = -1;        if (i >= 0) {	score = lm->bgprob[bg[i].probid].l;	if(lm->inclass_ugscore){ /*Only add within class prob if class information exists.				  Is actually ok to just add the score because if the word				  is not within-class. The returning scores will be 0. I just				  love to safe-guard it :-). 				 */	  score += lm->inclass_ugscore[w2];	}	lm->access_type = 2;    } else {	lm->n_bg_bo++;	lm->access_type = 1;	score = lm->ug[lw1].bowt.l + lm->ug[lw2].prob.l;    }#if 0    printf ("      %5d %5d -> %8d\n", lw1, lw2, score);#endif    return (score);}static void load_tg (lm_t *lm, s3lmwid_t lw1, s3lmwid_t lw2){    int32 i, n, b;    int32 t = -1; /* Let's make sure that if t isn't initialized after the		   * "if" statement below, it makes things go bad */    int32 isLM_IN_MEMORY=0;    bg_t *bg;    tg_t *tg;    tginfo_t *tginfo;    if (cmd_ln_int32 ("-lminmemory"))       isLM_IN_MEMORY = 1;        else      isLM_IN_MEMORY = 0;        /* First allocate space for tg information for bg lw1,lw2 */    tginfo = (tginfo_t *) ckd_malloc (sizeof(tginfo_t));    tginfo->w1 = lw1;    tginfo->tg = NULL;    tginfo->next = lm->tginfo[lw2];    lm->tginfo[lw2] = tginfo;        /* Locate bigram lw1,lw2 */    b = lm->ug[lw1].firstbg;    n = lm->ug[lw1+1].firstbg - b;        /* Make sure bigrams for lw1, if any, loaded into memory */    if (n > 0) {	if (! lm->membg[lw1].bg)	    load_bg (lm, lw1);	lm->membg[lw1].used = 1;	bg = lm->membg[lw1].bg;    }    /* At this point, n = #bigrams for lw1 */    if ((n > 0) && ((i = find_bg (bg, n, lw2)) >= 0)) {	tginfo->bowt = lm->tgbowt[bg[i].bowtid].l;		/* Find t = Absolute first trigram index for bigram lw1,lw2 */	b += i;			/* b = Absolute index of bigram lw1,lw2 on disk */	t = lm->tg_segbase[b >> lm->log_bg_seg_sz];	t += bg[i].firsttg;		/* Find #tg for bigram w1,w2 */	n = lm->tg_segbase[(b+1) >> lm->log_bg_seg_sz];	n += bg[i+1].firsttg;	n -= t;	tginfo->n_tg = n;    } else {			/* No bigram w1,w2 */	tginfo->bowt = 0;	n = tginfo->n_tg = 0;    }    /* "t" has not been assigned any meanigful value, so if you use it     * beyond this point, make sure it's been properly assigned.     */   //	assert (t != -1);    /* At this point, n = #trigrams for lw1,lw2.  Read them in */    if (isLM_IN_MEMORY) {		/* RAH, already have this in memory */      if (n > 0){	assert(t != -1);	tg = tginfo->tg = &lm->tg[t];      }    } else {    if (n > 0) {	tg = tginfo->tg = (tg_t *) ckd_calloc (n, sizeof(tg_t));	if (fseek (lm->fp, lm->tgoff + t*sizeof(tg_t), SEEK_SET) < 0)	    E_FATAL_SYSTEM("fseek failed\n");	if (fread (tg, sizeof(tg_t), n, lm->fp) != (size_t)n)	    E_FATAL("fread(tg, %d at %d) failed\n", n, lm->tgoff);	if (lm->byteswap) {	    for (i = 0; i < n; i++) {		SWAP_INT16(&(tg[i].wid));		SWAP_INT16(&(tg[i].probid));	    }	}    }    }    lm->n_tg_fill++;    lm->n_tg_inmem += n;}/* Similar to find_bg */static int32 find_tg (tg_t *tg, int32 n, s3lmwid_t w){    int32 i, b, e;    b = 0;    e = n;    while (e-b > BINARY_SEARCH_THRESH) {	i = (b+e)>>1;	if (tg[i].wid < w)	    b = i+1;	else if (tg[i].wid > w)	    e = i;	else	    return i;    }        for (i = b; (i < e) && (tg[i].wid != w); i++);    return ((i < e) ? i : -1);}int32 lm_tglist (lm_t *lm, s3lmwid_t lw1, s3lmwid_t lw2, tg_t **tgptr, int32 *bowt){    tginfo_t *tginfo, *prev_tginfo;    if (lm->n_tg <= 0) {	*tgptr = NULL;	*bowt = 0;	return 0;    }        if (NOT_S3LMWID(lw1) || (lw1 >= lm->n_ug))	E_FATAL("Bad lw1 argument (%d) to lm_tglist\n", lw1);    if (NOT_S3LMWID(lw2) || (lw2 >= lm->n_ug))	E_FATAL("Bad lw2 argument (%d) to lm_tglist\n", lw2);    prev_tginfo = NULL;    for (tginfo = lm->tginfo[lw2]; tginfo; tginfo = tginfo->next) {	if (tginfo->w1 == lw1)	    break;	prev_tginfo = tginfo;    }        if (! tginfo) {    	load_tg (lm, lw1, lw2);	tginfo = lm->tginfo[lw2];    } else if (prev_tginfo) {	prev_tginfo->next = tginfo->next;	tginfo->next = lm->tginfo[lw2];	lm->tginfo[lw2] = tginfo;    }    tginfo->used = 1;    *tgptr = tginfo->tg;    *bowt = tginfo->bowt;    return (tginfo->n_tg);}/* *  This function look-ups the trigram score of p(lw3|lw2,lw1) *  and compute the in-class ug probability of w3. *  The information for lw3 and w3 are repeated because the legacy  *  implementation(since s3.2) of vithist used only LM wid rather  *  than dictionary wid.   *   */int32 lm_tg_score (lm_t *lm, s3lmwid_t lw1, s3lmwid_t lw2, s3lmwid_t lw3, s3wid_t w3){    int32 i, h, n, score;    tg_t *tg;    tginfo_t *tginfo, *prev_tginfo;        if ((lm->n_tg == 0) || (NOT_S3LMWID(lw1)))	return (lm_bg_score (lm, lw2, lw3, w3));    lm->n_tg_score++;    if (NOT_S3LMWID(lw1) || (lw1 >= lm->n_ug))	E_FATAL("Bad lw1 argument (%d) to lm_tg_score\n", lw1);    if (NOT_S3LMWID(lw2) || (lw2 >= lm->n_ug))	E_FATAL("Bad lw2 argument (%d) to lm_tg_score\n", lw2);    if (NOT_S3LMWID(lw3) || (lw3 >= lm->n_ug))	E_FATAL("Bad lw3 argument (%d) to lm_tg_score\n", lw3);    /* Lookup tgcache first; compute hash(lw1, lw2, lw3) */    h = ((lw1 & 0x000003ff) << 21) + ((lw2 & 0x000003ff) << 11) + (lw3 & 0x000007ff);    h %= LM_TGCACHE_SIZE;    if ((lm->tgcache[h].lwid[0] == lw1) &&	(lm->tgcache[h].lwid[1] == lw2) &&	(lm->tgcache[h].lwid[2] == lw3)) {	lm->n_tgcache_hit++;	return lm->tgcache[h].lscr;    }    prev_tginfo = NULL;    for (tginfo = lm->tginfo[lw2]; tginfo; tginfo = tginfo->next) {	if (tginfo->w1 == lw1)	    break;	prev_tginfo = tginfo;    }    if (! tginfo) {    	load_tg (lm, lw1, lw2);	tginfo = lm->tginfo[lw2];    } else if (prev_tginfo) {	prev_tginfo->next = tginfo->next;	tginfo->next = lm->tginfo[lw2];	lm->tginfo[lw2] = tginfo;    }    tginfo->used = 1;    /* Trigrams for w1,w2 now in memory; look for w1,w2,w3 */    n = tginfo->n_tg;    tg = tginfo->tg;    if ((i = find_tg (tg, n, lw3)) >= 0) {	score = lm->tgprob[tg[i].probid].l ;	if(lm->inclass_ugscore){ /*Only add within class prob if class information exists.				  Is actually ok to just add the score because if the word				  is not within-class. The returning scores will be 0. 				 */	  score += lm->inclass_ugscore[w3];	}	lm->access_type = 3;    } else {	lm->n_tg_bo++;	score = tginfo->bowt + lm_bg_score(lm, lw2, lw3, w3);    }#if 0    printf ("%5d %5d %5d -> %8d\n", lw1, lw2, lw3, score);#endif        lm->tgcache[h].lwid[0] = lw1;    lm->tgcache[h].lwid[1] = lw2;    lm->tgcache[h].lwid[2] = lw3;    lm->tgcache[h].lscr = score;        return (score);}s3lmwid_t lm_wid (lm_t *lm, char *word){    int32 i;        for (i = 0; i < lm->n_ug; i++)	if (strcmp (lm->wordstr[i], word) == 0)	    return ((s3lmwid_t) i);        return BAD_S3LMWID;}void lm_free (lm_t *lm){  int i;  for (i=0;i<lm->n_ug;i++)     ckd_free ((void *) lm->wordstr[i]);	/*  */  ckd_free ((void *) lm->membg);  ckd_free ((void *) lm->wordstr);  ckd_free ((void *) lm->tgcache);  ckd_free ((void *) lm->tg_segbase);  ckd_free ((void *) lm->tgprob);  ckd_free ((void *) lm->tgbowt);  ckd_free ((void *) lm->bgprob);  ckd_free ((void *) lm->tginfo);  ckd_free ((void *) lm->ug);    ckd_free ((void *) lm);  }int32 lm_rawscore (lm_t *lm, int32 score, float64 lwf){    if (lwf != 1.0)        score /= (int32)lwf;    score -= lm->wip;    score /= (int32)lm->lw;        return score;}#if (_LM_TEST_)static int32 sentence_lmscore (lm_t *lm, char *line){    char *word[1024];    s3lmwid_t w[1024];    int32 nwd, score, tgscr;    int32 i, j;        if ((nwd = str2words (line, word, 1020)) < 0)	E_FATAL("Increase word[] and w[] arrays size\n");        w[0] = BAD_S3LMWID;    w[1] = lm_wid (lm, S3_START_WORD);    if (NOT_S3LMWID(w[1]))	E_FATAL("Unknown word: %s\n", S3_START_WORD);        for (i = 0; i < nwd; i++) {	w[i+2] = lm_wid (lm, word[i]);	if (NOT_S3LMWID(w[i+2])) {	    E_ERROR("Unknown word: %s\n", word[i]);	    return 0;	}    }    w[i+2] = lm_wid (lm, S3_FINISH_WORD);    if (NOT_S3LMWID(w[i+2]))	E_FATAL("Unknown word: %s\n", S3_FINISH_WORD);        score = 0;    for (i = 0, j = 2; i <= nwd; i++, j++) {	tgscr = lm_tg_score (lm, w[j-2], w[j-1], w[j]);	score += tgscr;	printf ("\t%10d %s\n", tgscr, lm->wordstr[w[j]]);    }        return (score);}main (int32 argc, char *argv[]){    char line[4096];    int32 score, k;    lm_t *lm;        if (argc < 2)	E_FATAL("Usage: %s <LMdumpfile>\n", argv[0]);    logs3_init (1.0001);    lm = lm_read (argv[1], 9.5, 0.2);    if (1) {			/* Short cut this so we can test for memory leaks */      for (;;) {	printf ("> ");	if (fgets (line, sizeof(line), stdin) == NULL)	    break;		score = sentence_lmscore (lm, line);	k = strlen(line);	if (line[k-1] == '\n')	    line[k-1] = '\0';	printf ("LMScr(%s) = %d\n", line, score);      }    } /*  */    lm_free(lm);    exit (0);}#endif
上一页 1 23
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -