📄 lm.c
字号:
/* ==================================================================== * Copyright (c) 1999-2004 Carnegie Mellon University. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * This work was supported in part by funding from the Defense Advanced * Research Projects Agency and the National Science Foundation of the * United States of America, and the CMU Sphinx Speech Consortium. * * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * ==================================================================== * *//* * lm.c -- Disk-based backoff word trigram LM module. * * ********************************************** * CMU ARPA Speech Project * * Copyright (c) 1997 Carnegie Mellon University. * ALL RIGHTS RESERVED. * ********************************************** * * HISTORY * * 20.Apr.2001 RAH (rhoughton@mediasite.com, ricky.houghton@cs.cmu.edu) * Adding lm_free() to free allocated memory * * 30-Dec-2000 Rita Singh (rsingh@cs.cmu.edu) at Carnegie Mellon University * Removed language weight application to wip. To maintain * comparability between s3decode and current decoder. Does * not affect decoding performance. * * 23-Feb-2000 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Bugfix: Applied language weight to word insertion penalty. * * 24-Jun-97 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Added lm_t.access_type; made lm_wid externally visible. * * 24-Jun-97 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Added lm_t.log_bg_seg_sz and lm_t.bg_seg_sz. * * 13-Feb-97 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University. * Creating from original S3 version. */#include "lm.h"#include "bio.h"#include "logs3.h"const char *darpa_hdr = "Darpa Trigram LM";/*ARCHAN, 20041112: NOP, NO STATIC VARIABLES! */static lm_t *lm_read_dump (char *file, float64 lw, float64 wip, float64 uw,int32 n_lmclass_used,lmclass_t *lmclass,int32 dict_size);int32 lm_get_classid (lm_t *model, char *name){ int32 i; if (! model->lmclass) return BAD_LMCLASSID; for (i = 0; i < model->n_lmclass; i++) { if (strcmp (lmclass_getname(model->lmclass[i]), name) == 0) return (i + LM_CLASSID_BASE); } return BAD_LMCLASSID;}int32 lm_delete (lm_t *lm,lmset_t *lmset){#if 0 int32 i; tginfo_t *tginfo, *next_tginfo; if (lm->fp) fclose (lm->fp); free (lm->ug); if (lm->n_bg > 0) { if (lm->bg) /* Memory-based; free all bg */ free (lm->bg); else { /* Disk-based; free in-memory bg */ for (i = 0; i < lm->n_ug; i++) if (lm->membg[i].bg) free (lm->membg[i].bg); free (lm->membg); } free (lm->bgprob); } if (lm->n_tg > 0) { if (lm->tg) /* Memory-based; free all tg */ free (lm->tg); for (i = 0; i < lm->n_ug; i++) { /* Free cached tg access info */ for (tginfo = lm->tginfo[i]; tginfo; tginfo = next_tginfo) { next_tginfo = tginfo->next; if ((! lm->tg) && tginfo->tg) /* Disk-based; free in-memory tg */ free (tginfo->tg); free (tginfo); } } free (lm->tginfo); free (lm->tgprob); free (lm->tgbowt); free (lm->tg_segbase); } for (i = 0; i < lm->n_ug; i++) free (lm->wordstr[i]); free (lm->wordstr); free (lm); free (lmset[i].name); for (; i < n_lm-1; i++) lmset[i] = lmset[i+1]; --n_lm; E_INFO("LM(\"%s\") deleted\n", name);#endif E_INFO("Warning, lm_delete is currently empty, no memory is deleted\n"); return (0);}/* Apply unigram weight; should be part of LM creation, but... */static void lm_uw (lm_t *lm, float64 uw){ int32 i, loguw, loguw_, loguniform, p1, p2; /* Interpolate unigram probs with uniform PDF, with weight uw */ loguw = logs3 (uw); loguw_ = logs3 (1.0 - uw); loguniform = logs3 (1.0/(lm->n_ug-1)); /* Skipping S3_START_WORD */ for (i = 0; i < lm->n_ug; i++) { if (strcmp (lm->wordstr[i], S3_START_WORD) != 0) { p1 = lm->ug[i].prob.l + loguw; p2 = loguniform + loguw_; lm->ug[i].prob.l = logs3_add (p1, p2); } }}static void lm2logs3 (lm_t *lm, float64 uw){ int32 i; for (i = 0; i < lm->n_ug; i++) { lm->ug[i].prob.l = log10_to_logs3 (lm->ug[i].prob.f); lm->ug[i].bowt.l = log10_to_logs3 (lm->ug[i].bowt.f); } lm_uw (lm, uw); for (i = 0; i < lm->n_bgprob; i++) lm->bgprob[i].l = log10_to_logs3 (lm->bgprob[i].f); if (lm->n_tg > 0) { for (i = 0; i < lm->n_tgprob; i++) lm->tgprob[i].l = log10_to_logs3 (lm->tgprob[i].f); for (i = 0; i < lm->n_tgbowt; i++) lm->tgbowt[i].l = log10_to_logs3 (lm->tgbowt[i].f); }}void lm_set_param (lm_t *lm, float64 lw, float64 wip){ int32 i, iwip; float64 f; if (lw <= 0.0) E_FATAL("lw = %e\n", lw); if (wip <= 0.0) E_FATAL("wip = %e\n", wip);#if 0 /* No lang weight on wip */ iwip = logs3(wip) * lw; #endif iwip = logs3(wip); f = lw / lm->lw; for (i = 0; i < lm->n_ug; i++) { lm->ug[i].prob.l = (int32)((lm->ug[i].prob.l - lm->wip) * f) + iwip; lm->ug[i].bowt.l = (int32)(lm->ug[i].bowt.l * f); } for (i = 0; i < lm->n_bgprob; i++) lm->bgprob[i].l = (int32)((lm->bgprob[i].l - lm->wip) * f) + iwip; if (lm->n_tg > 0) { for (i = 0; i < lm->n_tgprob; i++) lm->tgprob[i].l = (int32)((lm->tgprob[i].l - lm->wip) * f) + iwip; for (i = 0; i < lm->n_tgbowt; i++) lm->tgbowt[i].l = (int32)(lm->tgbowt[i].l * f); } lm->lw = (float32) lw; lm->wip = iwip;}static int32 lm_fread_int32 (lm_t *lm){ int32 val; if (fread (&val, sizeof(int32), 1, lm->fp) != 1) E_FATAL("fread failed\n"); if (lm->byteswap) SWAP_INT32(&val); return (val);}/* read in the LM control structure *//* 20040218 Arthur: This function is largely copied from Sphinx 2 because I don't want * to spend too much time in writing file reading routine. * I attached the comment in Sphinx 2 here. It specifies the restriction of the Darpa file format. **//* * Read control file describing multiple LMs, if specified. * File format (optional stuff is indicated by enclosing in []): * * [{ LMClassFileName LMClassFilename ... }] * TrigramLMFileName LMName [{ LMClassName LMClassName ... }] * TrigramLMFileName LMName [{ LMClassName LMClassName ... }] * ... * (There should be whitespace around the { and } delimiters.) * * This is an extension of the older format that had only TrigramLMFilenName * and LMName pairs. The new format allows a set of LMClass files to be read * in and referred to by the trigram LMs. (Incidentally, if one wants to use * LM classes in a trigram LM, one MUST use the -lmctlfn flag. It is not * possible to read in a class-based trigram LM using the -lmfn flag.) * * ARCHAN, */lmset_t* lm_read_ctl(char *ctlfile,dict_t* dict,float64 lw, float64 wip, float64 uw,char *lmdumpdir,int32* n_lm, int32* n_alloclm,int32 dict_size){ FILE *ctlfp; FILE *tmp; char lmfile[4096], lmname[4096], str[4096];
int32 isLM_IN_MEMORY;
lmclass_set_t lmclass_set; lmclass_t *lmclass, cl; int32 n_lmclass, n_lmclass_used; int32 i; lm_t *lm; lmset_t *lmset=NULL; tmp=NULL;
isLM_IN_MEMORY=0; lmclass_set = lmclass_newset(); E_INFO("Reading LM control file '%s'\n",ctlfile); if (cmd_ln_int32 ("-lminmemory")) isLM_IN_MEMORY = 1; else isLM_IN_MEMORY = 0; ctlfp = myfopen (ctlfile, "r"); if (fscanf (ctlfp, "%s", str) == 1) { if (strcmp (str, "{") == 0) { /* Load LMclass files */ while ((fscanf (ctlfp, "%s", str) == 1) && (strcmp (str, "}") != 0)) lmclass_set = lmclass_loadfile (lmclass_set, str); if (strcmp (str, "}") != 0) E_FATAL("Unexpected EOF(%s)\n", ctlfile); if (fscanf (ctlfp, "%s", str) != 1) str[0] = '\0'; } } else str[0] = '\0'; #if 0 tmp=myfopen("./tmp","w"); lmclass_set_dump(lmclass_set,tmp); fclose(tmp); #endif /* Fill in dictionary word id information for each LMclass word */ for (cl = lmclass_firstclass(lmclass_set); lmclass_isclass(cl); cl = lmclass_nextclass(lmclass_set, cl)) { /* For every words in the class, set the dictwid correctly The following piece of code replace s2's kb_init_lmclass_dictwid (cl); doesn't do any checking even the id is a bad dict id. This only sets the information in the lmclass_set, but not lm-2-dict or dict-2-lm map. In Sphinx 3, they are done in wid_dict_lm_map in wid.c. */ lmclass_word_t w; int32 wid; for (w = lmclass_firstword(cl); lmclass_isword(w); w = lmclass_nextword(cl, w)) { wid = dict_wordid (dict,lmclass_getword(w));#if 0 E_INFO("In class %s, Word %s, wid %d\n",cl->name,lmclass_getword(w),wid);#endif lmclass_set_dictwid (w, wid); } } /* At this point if str[0] != '\0', we have an LM filename */ n_lmclass = lmclass_get_nclass(lmclass_set); lmclass = (lmclass_t *) ckd_calloc (n_lmclass, sizeof(lmclass_t)); E_INFO("Number of LM class specified %d in file %s\n",n_lmclass,ctlfile); /* Read in one LM at a time */ while (str[0] != '\0') { strcpy (lmfile, str); if (fscanf (ctlfp, "%s", lmname) != 1) E_FATAL("LMname missing after LMFileName '%s'\n", lmfile); n_lmclass_used = 0; if (fscanf (ctlfp, "%s", str) == 1) { if (strcmp (str, "{") == 0) { while ((fscanf (ctlfp, "%s", str) == 1) && (strcmp (str, "}") != 0)) { if (n_lmclass_used >= n_lmclass){ E_FATAL("Too many LM classes specified for '%s'\n", lmfile); } lmclass[n_lmclass_used] = lmclass_get_lmclass (lmclass_set, str); if (! (lmclass_isclass(lmclass[n_lmclass_used]))) E_FATAL("LM class '%s' not found\n", str); n_lmclass_used++; } if (strcmp (str, "}") != 0) E_FATAL("Unexpected EOF(%s)\n", ctlfile); if (fscanf (ctlfp, "%s", str) != 1) str[0] = '\0'; } } else str[0] = '\0'; if (n_lmclass_used > 0){ /*ARCHAN DON'T do txt reading for a moment, just try to read the dmp file, bypass it for a moment. lm_read_clm(lmfile, lmname, lw,uw,wip, lmclass, lmset, dict, n_lmclass_used, lmdumpdir);*/ lm = (lm_t*) lm_read_dump (lmfile, lw, wip, uw, n_lmclass_used,lmclass,dict_size); /* Initialize the fast trigram cache, with all entries invalid */ lm->tgcache = (lm_tgcache_entry_t *) ckd_calloc(LM_TGCACHE_SIZE, sizeof(lm_tgcache_entry_t)); for (i = 0; i < LM_TGCACHE_SIZE; i++) lm->tgcache[i].lwid[0] = BAD_S3LMWID; } else{ /*Again, bypass this currently, lm_read_txt(lmfile, lmname,lw,uw,wip,lmset,dict,lmdumpdir);*/ lm = (lm_t*) lm_read_dump (lmfile, lw, wip, uw,0,NULL,dict_size); /* Initialize the fast trigram cache, with all entries invalid */ lm->tgcache = (lm_tgcache_entry_t *) ckd_calloc(LM_TGCACHE_SIZE, sizeof(lm_tgcache_entry_t)); for (i = 0; i < LM_TGCACHE_SIZE; i++) lm->tgcache[i].lwid[0] = BAD_S3LMWID; } if(*n_lm == *n_alloclm){ lmset= (lmset_t *) ckd_realloc(lmset,(*n_alloclm+16)*sizeof(lmset_t)); *n_alloclm+=16; } lmset[*n_lm].name = ckd_salloc(lmname); lmset[*n_lm].lm=lm; *n_lm+=1; } E_INFO("No. of LM set allocated %d, no. of LM %d \n",*n_alloclm,*n_lm); fclose (ctlfp); return lmset;}static int32 lm_build_lmclass_info(lm_t *lm,float64 lw, float64 uw, float64 wip,int32 n_lmclass_used,lmclass_t *lmclass){ int i; if(n_lmclass_used >0){ lm->lmclass=(lmclass_t*) ckd_calloc(n_lmclass_used,sizeof(lmclass_t)); for(i=0; i<n_lmclass_used ;i++) lm->lmclass[i]=lmclass[i]; }else lm->lmclass= NULL; lm->n_lmclass = n_lmclass_used; lm->inclass_ugscore = (int32*)ckd_calloc(lm->dict_size,sizeof(int32)); E_INFO("LM->inclass_ugscore size %d\n",lm->dict_size); E_INFO("Number of class used %d\n",n_lmclass_used); return 1;}/* * Read LM dump (<lmname>.DMP) file and make it the current LM. * Same interface as lm_read except that the filename refers to a .DMP file. */static lm_t *lm_read_dump (char *file, float64 lw, float64 wip, float64 uw,int32 n_lmclass_used, lmclass_t *lmclass,int32 dict_size){ lm_t *lm; int32 i, j, k, vn; char str[1024]; char *tmp_word_str; s3lmwid_t startwid, endwid; int32 isLM_IN_MEMORY=0; if (cmd_ln_int32 ("-lminmemory")) isLM_IN_MEMORY = 1; else isLM_IN_MEMORY = 0; lm = (lm_t *) ckd_calloc (1, sizeof(lm_t)); lm->dict_size=dict_size; if ((lm->fp = fopen (file, "rb")) == NULL) E_FATAL_SYSTEM("fopen(%s,rb) failed\n", file); /* Standard header string-size; set byteswap flag based on this */ if (fread (&k, sizeof(int32), 1, lm->fp) != 1) E_FATAL("fread(%s) failed\n", file); if ((size_t)k == strlen(darpa_hdr)+1) lm->byteswap = 0; else { SWAP_INT32(&k); if ((size_t)k == strlen(darpa_hdr)+1)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -