📄 lm_3g.c
字号:
/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- *//* ==================================================================== * Copyright (c) 1999-2001 Carnegie Mellon University. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * This work was supported in part by funding from the Defense Advanced * Research Projects Agency and the National Science Foundation of the * United States of America, and the CMU Sphinx Speech Consortium. * * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * ==================================================================== * *//* * * lm_3g.c -- Darpa Trigram LM module. * * HISTORY * * 28-Oct-98 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Added lm3g_access_type() and necessary support. * * 15-Oct-98 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Bugfix: inclass_ugscore[lw3] changed to inclass_ugscore[w3] in * lm3g_tg_score(). (Thanks to dbansal@cs.) * * 15-Jul-98 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Corrected references to unigram_t.wid to unigram_t.mapid. * * 14-Apr-98 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Added lm3g_n_lm() and lm3g_index2name(). * * 03-Apr-97 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Added lm3g_raw_score() and lm_t.invlw. * Changed a number of function names from lm_... to lm3g_... * * 09-Jan-97 M K Ravishankar (rkm@cs) at Carnegie Mellon University * BUGFIX: Added check for lmp->unigrams[i].wid in lm_set_current(). * * 06-Dec-95 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Changed function name lmname_to_lm() to lm_name2lm(). * * 06-Dec-95 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Changed function name get_current_lm to lm_get_current. * Changed check for already existing word in lm_add_word, and added * condition to updating dictwid_map. * * 01-Jul-95 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Removed LM cache and replaced with find_bg and find_tg within the main * bigrams and trigram structures. No loss of speed and uses less memory. * * 24-Jun-95 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Fixed a number of memory leaks while deleting an LM. Added the global * dictwid_map, and allocated it once and never freed. Made sure lm_cache * is created only once. * * 14-Jun-95 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Modified lm_read to return 0 on success, and to delete any existing LM * with the new LM name (instead of reporting error and exiting). * Added backslash option in building filenames (for PC compatibility). * * Revision 8.9 94/10/11 12:36:28 rkm * Changed lm_tg_score to call lm_bg_score if no trigrams present or * the first word is invalid. * * Revision 8.8 94/07/29 11:54:23 rkm * Renamed lmSetParameters to lm_set_param and moved it into lm_add(). * Added functions lm_init_oov() to create initial list of OOVs, * and lm_add_word() to add new OOV at run time. * * Revision 8.7 94/05/19 14:19:59 rkm * Rewrote LM cache code for greater efficiency. * * Revision 8.6 94/05/10 10:47:58 rkm * Added lm_add() and lm_set_param() functions, for dynamically adding a new * in-memory LM to the set of available LMs. * * Revision 8.5 94/04/22 13:53:27 rkm * Added query_lm_cache_lines() to allow run-time spec of #cache lines. * * Revision 8.4 94/04/14 15:08:46 rkm * Added function lm_delete() to delete a named LM and reclaim space. * * Revision 8.3 94/04/14 14:40:27 rkm * Minor changes. * * Revision 8.1 94/02/15 15:09:22 rkm * Derived from v7. Includes multiple LMs for grammar switching. * * Revision 6.13 94/02/11 13:14:45 rkm * Added bigram and trigram multi-line caches, and functions, for v7. * Replaced sequential search in wstr2wid() with hash_lookup(). * * Revision 6.12 94/01/07 10:56:16 rkm * Corrected bug relating to input file format. * * Revision 6.11 93/12/17 13:14:52 rkm * *** empty log message *** * * Revision 6.10 93/12/03 17:09:59 rkm * Added ability to handle bigram-only dump files. * Added <s> </s> bigram -> MIN_PROB. * Added timestamp to dump files. * * Revision 6.9 93/12/01 12:29:55 rkm * Added ability to handle LM files containing only bigrams. * Excluded start_sym from interpolation of unigram prob with uniform prob. * * * 93/10/21 rkm@cs.cmu.edu * Added <c.h> * * Revision 6.6 93/10/19 18:58:10 rkm * Added code to change bigram-prob(<s>,<s>) to very low value. The * Darpa LM file contains a spurious value to be ignored. * Fixed bug that dumps one trigram entry too many. * * Revision 6.5 93/10/15 15:00:14 rkm * * Revision 6.4 93/10/13 16:56:04 rkm * Added LM cache line stats. * Added bg_only option for lm_read parameter list * (but not yet implemented). * Changed proc name ilm_LOG_prob_of to lm3g_prob, to avoid conflict * with Roni's ILM function of the same name. * * Revision 6.3 93/10/09 17:01:55 rkm * M K Ravishankar (rkm@cs) at Carnegie Mellon * Cleaned up handling precompiled binary 3g LM file, * Added SWAP option for HP platforms. * * Revision 6.2 93/10/06 11:08:15 rkm * M K Ravishankar (rkm@cs) at Carnegie Mellon University * Darpa Trigram LM module. Created. */#include <stdio.h>#include <string.h>#include <stdlib.h>#include "s2types.h"#include "s2io.h"#include "ckd_alloc.h"#include "cmd_ln.h"#include "pio.h"#include "basic_types.h"#include "assert.h"#include "strfuncs.h"#include "linklist.h"#include "list.h"#include "hash_table.h"#include "err.h"#include "lmclass.h"#include "lm_3g.h"#include "log.h"#include "search_const.h"#include "msd.h"#include "dict.h"#include "kb.h"#include "fbs.h"#include "byteorder.h"#include "dict.h"#define UG_MAPID(m,u) ((m)->unigrams[u].mapid)#define UG_PROB_F(m,u) ((m)->unigrams[u].prob1.f)#define UG_BO_WT_F(m,u) ((m)->unigrams[u].bo_wt1.f)#define UG_PROB_L(m,u) ((m)->unigrams[u].prob1.l)#define UG_BO_WT_L(m,u) ((m)->unigrams[u].bo_wt1.l)#define FIRST_BG(m,u) ((m)->unigrams[u].bigrams)#define LAST_BG(m,u) (FIRST_BG((m),(u)+1)-1)#define BG_WID(m,b) ((m)->bigrams[b].wid)#define BG_PROB_F(m,b) ((m)->prob2[(m)->bigrams[b].prob2].f)#define BG_BO_WT_F(m,b) ((m)->bo_wt2[(m)->bigrams[b].bo_wt2].f)#define BG_PROB_L(m,b) ((m)->prob2[(m)->bigrams[b].prob2].l)#define BG_BO_WT_L(m,b) ((m)->bo_wt2[(m)->bigrams[b].bo_wt2].l)#define TSEG_BASE(m,b) ((m)->tseg_base[(b)>>LOG_BG_SEG_SZ])#define FIRST_TG(m,b) (TSEG_BASE((m),(b))+((m)->bigrams[b].trigrams))#define LAST_TG(m,b) (FIRST_TG((m),(b)+1)-1)#define TG_WID(m,t) ((m)->trigrams[t].wid)#define TG_PROB_F(m,t) ((m)->prob3[(m)->trigrams[t].prob3].f)#define TG_PROB_L(m,t) ((m)->prob3[(m)->trigrams[t].prob3].l)static double oov_ugprob = -5.0; /* Actually, logprob */static char const *start_sym = "<s>";static char const *end_sym = "</s>";static char const *darpa_hdr = "Darpa Trigram LM";/* FIXME: Why does lm3g2dmp.c have its own versions of these functions? */static int32 lmname_to_id(char const *name);static int32 get_dict_size(int32 *inout_n_unigram, char const *lmname);static int32 lmtext_load(char const *filename, char const *lmname, lm_t **out_model);static int32 lm3g_load(char const *file, char const *lmname, lm_t **out_model, char const *lmfile);static int32 lm3g_dump(char const *file, lm_t * model, char const *lmfile);static void lm_set_param(lm_t * model, double lw, double uw, double wip, int32 word_pair);/* Structure for maintaining multiple, named LMs */static struct lmset_s { char *name; lm_t *lm;} *lmset;static int32 n_lm = 0; /* Total #LMs (actual) */static int32 n_lm_alloc = 0; /* Total #LMs (for which space has been allocated) *//* The currently active LM */static lm_t *lmp;/* Words in LM; used only for building internal LM from LM file */static char **word_str;static int32 lm_last_access_type; /* Hack!! See ACCESS definitions in .h */#define MIN_PROB_F -99.0#define MAX_SORTED_ENTRIES 65534/* Base values for ranges of unigram_t.mapid */#define LM_DICTWID_BASE 0 /* Do not change this */#define LM_CLASSID_BASE 0x01000000 /* Interpreted as LMclass ID */#define LM_DICTWID_BADMAP -16000 /* An illegal mapping */#define LM_CLASSID_TO_CLASS(m,i) ((model)->lmclass[(i)-LM_CLASSID_BASE])/* * Bigram probs and bo-wts, and trigram probs are kept in separate tables * rather than within the bigram_t and trigram_t structures. These tables * hold unique prob and bo-wt values, and can be < 64K long (see lm_3g.h). * The following tree structure is used to construct these tables of unique * values. Whenever a new value is read from the LM file, the sorted tree * structure is searched to see if the value already exists, and inserted * if not found. */typedef struct sorted_entry_s { log_t val; /* value being kept in this node */ uint16 lower; /* index of another entry. All descendants down this path have their val < this node's val. 0 => no son exists (0 is root index) */ uint16 higher; /* index of another entry. All descendants down this path have their val > this node's val 0 => no son exists (0 is root index) */} sorted_entry_t;/* * The sorted list. list is a (64K long) array. The first entry is the * root of the tree and is created during initialization. */typedef struct { sorted_entry_t *list; int32 free; /* first free element in list */} sorted_list_t;/* Arrays of unique bigram probs and bo-wts, and trigram probs */static sorted_list_t sorted_prob2;static sorted_list_t sorted_bo_wt2;static sorted_list_t sorted_prob3;/* * Initialize sorted list with the 0-th entry = MIN_PROB_F, which may be needed * to replace spurious values in the Darpa LM file. */static voidinit_sorted_list(sorted_list_t * l){ l->list = ckd_calloc(MAX_SORTED_ENTRIES, sizeof(sorted_entry_t)); l->list[0].val.f = MIN_PROB_F; l->list[0].lower = 0; l->list[0].higher = 0; l->free = 1;}static voidfree_sorted_list(sorted_list_t * l){ free(l->list);}static log_t *vals_in_sorted_list(sorted_list_t * l){ log_t *vals; int32 i; vals = ckd_calloc(l->free, sizeof(log_t)); for (i = 0; i < l->free; i++) vals[i].f = l->list[i].val.f; return (vals);}static int32sorted_id(sorted_list_t * l, float *val){ int32 i = 0; for (;;) { if (*val == l->list[i].val.f) return (i); if (*val < l->list[i].val.f) { if (l->list[i].lower == 0) { if (l->free >= MAX_SORTED_ENTRIES) E_FATAL("sorted list overflow\n"); l->list[i].lower = l->free; (l->free)++; i = l->list[i].lower; l->list[i].val.f = *val; return (i); } else i = l->list[i].lower; } else { if (l->list[i].higher == 0) { if (l->free >= MAX_SORTED_ENTRIES) E_FATAL("sorted list overflow\n"); l->list[i].higher = l->free; (l->free)++; i = l->list[i].higher; l->list[i].val.f = *val; return (i); } else i = l->list[i].higher; } }}/* * allocate, initialize and return pointer to an array of unigram entries. */static unigram_t *NewUnigramTable(int32 n_ug){ unigram_t *table; int32 i; table = ckd_calloc(n_ug, sizeof(unigram_t)); for (i = 0; i < n_ug; i++) { table[i].mapid = NO_WORD; table[i].prob1.f = -99.0; table[i].bo_wt1.f = -99.0; } return table;}/* * returns a pointer to a new language model record. The size is passed in * as a parameter. */lm_t *NewModel(int32 n_ug, int32 n_bg, int32 n_tg, int32 n_dict){ lm_t *model; model = ckd_calloc(1, sizeof(lm_t)); /* Only allocate the stuff that isn't done elsewhere */ model->unigrams = NewUnigramTable(n_ug + 1); model->dictwid_map = ckd_calloc(n_dict, sizeof(int32)); model->max_ucount = model->ucount = n_ug; model->bcount = n_bg; model->tcount = n_tg; model->dict_size = n_dict; model->HT = hash_table_new(n_ug, HASH_CASE_NO); return model;}static int32wstr2wid(lm_t * model, char *w){ void *val; if (hash_table_lookup(model->HT, w, &val) != 0) return NO_WORD; return ((int32) val);}/* * Read and return #unigrams, #bigrams, #trigrams as stated in input file. */static voidReadNgramCounts(FILE * fp, int32 * n_ug, int32 * n_bg, int32 * n_tg){ char string[256]; int32 ngram, ngram_cnt; /* skip file until past the '\data\' marker */ do
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -