📄 cache_lm.c
字号:
/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- *//* ==================================================================== * Copyright (c) 1999-2001 Carnegie Mellon University. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * This work was supported in part by funding from the Defense Advanced * Research Projects Agency and the National Science Foundation of the * United States of America, and the CMU Sphinx Speech Consortium. * * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * ==================================================================== * *//* * cache_lm.c -- Dynamic cache language model based on Roni Rosenfeld's work. * * HISTORY * * 01-Apr-97 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Started, based on earlier FBS6 version. */#include <stdio.h>#include <stdlib.h>#include <string.h>#include <assert.h>#include <math.h>#include "s2types.h"#include "ckd_alloc.h"#include "basic_types.h"#include "search_const.h"#include "msd.h"#include "log.h"#include "cache_lm.h"#include "linklist.h"#include "list.h"#include "hash_table.h"#include "dict.h"#include "lmclass.h"#include "lm_3g.h"#include "kb.h"#include "err.h"static int32 log0;static int32 *log_count_tbl = NULL;#define LOG_COUNT_TBLSIZE 4096#define LOG_COUNT(x) (((x) >= LOG_COUNT_TBLSIZE) ? LOG((double)(x)) : log_count_tbl[(x)])/* * Cache LM initialize. */cache_lm_t *cache_lm_init(double ug_thresh, double min_uw, double max_uw, int32 nwd_uwrange, double bw){ cache_lm_t *lm; int32 i; lm = ckd_calloc(1, sizeof(cache_lm_t)); lm->ugprob_thresh = LOG(ug_thresh); lm->min_uw = min_uw; lm->max_uw = max_uw; lm->per_word_uw = (max_uw - min_uw) / (double) nwd_uwrange; lm->uw_ugcount_limit = nwd_uwrange; lm->uw = min_uw; lm->log_uw = LOG(min_uw); lm->bw = bw; lm->log_bw = LOG(bw); lm->log_remwt = LOG(1.0 - min_uw - bw); lm->n_word = kb_dict_maxsize(); lm->clm_ug = ckd_calloc(lm->n_word, sizeof(clm_ug_t)); lm->sum_ugcount = 0; log0 = LOG(0.0); if (!log_count_tbl) { log_count_tbl = ckd_calloc(LOG_COUNT_TBLSIZE, sizeof(int32)); for (i = 0; i < LOG_COUNT_TBLSIZE; i++) log_count_tbl[i] = LOG((double) i); } return lm;}voidcache_lm_reset(cache_lm_t * lm){ clm_bg_t *bg, *nextbg; int32 i; for (i = 0; i < lm->n_word; i++) { for (bg = lm->clm_ug[i].w2list; bg; bg = nextbg) { nextbg = bg->next; listelem_free(bg, sizeof(clm_bg_t)); } lm->clm_ug[i].w2list = NULL; lm->clm_ug[i].count = 0; lm->clm_ug[i].sum_w2count = 0; } lm->sum_ugcount = 0; lm->log_uw = LOG(lm->min_uw); lm->log_remwt = LOG(1.0 - lm->min_uw - lm->bw);}voidcache_lm_add_ug(cache_lm_t * lm, int32 w){ lm->clm_ug[w].count++; lm->sum_ugcount++; if (lm->sum_ugcount > lm->uw_ugcount_limit) return; /* Still within linear uw region; update unigram cache weight */ lm->uw += lm->per_word_uw; lm->log_uw = LOG(lm->uw); lm->log_remwt = LOG(1.0 - lm->uw - lm->bw);}voidcache_lm_add_bg(cache_lm_t * lm, int32 w1, int32 w2){ clm_bg_t *bg; /* Find cache LM bigram entry for w1,w2 */ for (bg = lm->clm_ug[w1].w2list; bg && (bg->w2 != w2); bg = bg->next); if (!bg) { /* First encounter of w1,w2 */ bg = (clm_bg_t *) listelem_alloc(sizeof(clm_bg_t)); bg->w2 = w2; bg->count = 1; bg->next = lm->clm_ug[w1].w2list; lm->clm_ug[w1].w2list = bg; } else bg->count++; lm->clm_ug[w1].sum_w2count++;}/* NOTE: Some approximations in the way the relative language weights are applied */int32cache_lm_score(cache_lm_t * lm, int32 w1, int32 w2, int32 * remwt){ int32 bgscr, ugscr, clmscr; clm_bg_t *bg; /* Unigram cache component */ if (lm->clm_ug[w2].count > 0) ugscr = LOG_COUNT(lm->clm_ug[w2].count) - LOG_COUNT(lm->sum_ugcount); else ugscr = log0; ugscr += lm->log_uw; /* Bigram cache component */ for (bg = lm->clm_ug[w1].w2list; bg && (bg->w2 != w2); bg = bg->next); if (bg) bgscr = LOG_COUNT(bg->count) - LOG_COUNT(lm->clm_ug[w1].sum_w2count); else bgscr = log0; bgscr += lm->log_bw; /* Combine unigram and bigram cache component scores */ if ((ugscr > log0) || (bgscr > log0)) { FAST_ADD(clmscr, ugscr, bgscr, fe_logadd_table, fe_logadd_table_size); } else clmscr = log0; *remwt = lm->log_remwt; return (clmscr);}voidcache_lm_dump(cache_lm_t * lm, char *file){ FILE *fp; int32 i; clm_bg_t *bg; if ((fp = fopen(file, "w")) == NULL) { E_ERROR("fopen(%s,w) failed\n", file); return; } fprintf(fp, "#CacheLMDump\n"); fprintf(fp, "#Unigrams\n"); for (i = 0; i < lm->n_word; i++) { if (lm->clm_ug[i].count > 0) fprintf(fp, "%d %s\n", lm->clm_ug[i].count, kb_get_word_str(i)); } fprintf(fp, "#Bigrams\n"); for (i = 0; i < lm->n_word; i++) { for (bg = lm->clm_ug[i].w2list; bg; bg = bg->next) { fprintf(fp, "%d %s %s\n", bg->count, kb_get_word_str(i), kb_get_word_str(bg->w2)); } } fprintf(fp, "#End\n"); fclose(fp);}voidcache_lm_load(cache_lm_t * lm, char *file){ FILE *fp; int32 i, n, w, w2; char line[16384], wd[4096], wd2[4096]; if ((fp = fopen(file, "r")) == NULL) { E_ERROR("fopen(%s,r) failed\n", file); return; } if (fgets(line, sizeof(line), fp) == NULL) { E_ERROR("%s: No header\n", file); fclose(fp); return; } if (strcmp(line, "#CacheLMDump\n") != 0) { E_ERROR("%s: Bad header line: %s\n", file, line); fclose(fp); return; } if ((fgets(line, sizeof(line), fp) == NULL) || (strcmp(line, "#Unigrams\n") != 0)) { E_ERROR("%s: Missing #Unigrams keyword\n", file); fclose(fp); return; } while (fgets(line, sizeof(line), fp) != NULL) { if (sscanf(line, "%d %s", &n, wd) != 2) break; w = kb_get_word_id(wd); if ((w < 0) || (w >= lm->n_word)) { E_ERROR("%s: Unknown word(%s); ignored\n", file, wd); continue; } for (i = 0; i < n; i++) cache_lm_add_ug(lm, w); } if (strcmp(line, "#Bigrams\n") != 0) { E_ERROR("%s: Missing #Bigrams keyword: %s\n", file, line); fclose(fp); return; } while (fgets(line, sizeof(line), fp) != NULL) { if (sscanf(line, "%d %s %s", &n, wd, wd2) != 3) break; w = kb_get_word_id(wd); w2 = kb_get_word_id(wd2); if ((w < 0) || (w >= lm->n_word)) { E_ERROR("%s: Unknown word(%s); ignored\n", file, wd); continue; } if ((w2 < 0) || (w2 >= lm->n_word)) { E_ERROR("%s: Unknown word(%s); ignored\n", file, wd2); continue; } for (i = 0; i < n; i++) cache_lm_add_bg(lm, w, w2); } if (strcmp(line, "#End\n") != 0) E_ERROR("%s: Missing #End keyword: %s\n", file, line); fclose(fp);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -