cache_lm.c

来自「WinCE平台上的语音识别程序」· C语言代码 · 共 316 行
316 行
/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- *//* ==================================================================== * Copyright (c) 1999-2001 Carnegie Mellon University.  All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright *    notice, this list of conditions and the following disclaimer.  * * 2. Redistributions in binary form must reproduce the above copyright *    notice, this list of conditions and the following disclaimer in *    the documentation and/or other materials provided with the *    distribution. * * This work was supported in part by funding from the Defense Advanced  * Research Projects Agency and the National Science Foundation of the  * United States of America, and the CMU Sphinx Speech Consortium. * * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * ==================================================================== * *//* * cache_lm.c -- Dynamic cache language model based on Roni Rosenfeld's work. * * HISTORY *  * 01-Apr-97	M K Ravishankar (rkm@cs) at Carnegie Mellon University * 		Started, based on earlier FBS6 version. 
*/#include <stdio.h>#include <stdlib.h>#include <string.h>#include <assert.h>#include <math.h>#include "s2types.h"#include "ckd_alloc.h"#include "basic_types.h"#include "search_const.h"#include "msd.h"#include "log.h"#include "cache_lm.h"#include "linklist.h"#include "list.h"#include "hash_table.h"#include "dict.h"#include "lmclass.h"#include "lm_3g.h"#include "kb.h"#include "err.h"static int32 log0;static int32 *log_count_tbl = NULL;#define LOG_COUNT_TBLSIZE	4096#define LOG_COUNT(x) (((x) >= LOG_COUNT_TBLSIZE) ? LOG((double)(x)) : log_count_tbl[(x)])/* * Cache LM initialize. */cache_lm_t *cache_lm_init(double ug_thresh,              double min_uw, double max_uw, int32 nwd_uwrange, double bw){    cache_lm_t *lm;    int32 i;    lm = ckd_calloc(1, sizeof(cache_lm_t));    lm->ugprob_thresh = LOG(ug_thresh);    lm->min_uw = min_uw;    lm->max_uw = max_uw;    lm->per_word_uw = (max_uw - min_uw) / (double) nwd_uwrange;    lm->uw_ugcount_limit = nwd_uwrange;    lm->uw = min_uw;    lm->log_uw = LOG(min_uw);    lm->bw = bw;    lm->log_bw = LOG(bw);    lm->log_remwt = LOG(1.0 - min_uw - bw);    lm->n_word = kb_dict_maxsize();    lm->clm_ug = ckd_calloc(lm->n_word, sizeof(clm_ug_t));    lm->sum_ugcount = 0;    log0 = LOG(0.0);    if (!log_count_tbl) {        log_count_tbl = ckd_calloc(LOG_COUNT_TBLSIZE, sizeof(int32));        for (i = 0; i < LOG_COUNT_TBLSIZE; i++)            log_count_tbl[i] = LOG((double) i);    }    return lm;}voidcache_lm_reset(cache_lm_t * lm){    clm_bg_t *bg, *nextbg;    int32 i;    for (i = 0; i < lm->n_word; i++) {        for (bg = lm->clm_ug[i].w2list; bg; bg = nextbg) {            nextbg = bg->next;            listelem_free(bg, sizeof(clm_bg_t));        }        lm->clm_ug[i].w2list = NULL;        lm->clm_ug[i].count = 0;        lm->clm_ug[i].sum_w2count = 0;    }    lm->sum_ugcount = 0;    lm->log_uw = LOG(lm->min_uw);    lm->log_remwt = LOG(1.0 - lm->min_uw - lm->bw);}voidcache_lm_add_ug(cache_lm_t * lm, int32 w){    lm->clm_ug[w].count++;    
lm->sum_ugcount++;    if (lm->sum_ugcount > lm->uw_ugcount_limit)        return;    /* Still within linear uw region; update unigram cache weight */    lm->uw += lm->per_word_uw;    lm->log_uw = LOG(lm->uw);    lm->log_remwt = LOG(1.0 - lm->uw - lm->bw);}voidcache_lm_add_bg(cache_lm_t * lm, int32 w1, int32 w2){    clm_bg_t *bg;    /* Find cache LM bigram entry for w1,w2 */    for (bg = lm->clm_ug[w1].w2list; bg && (bg->w2 != w2); bg = bg->next);    if (!bg) {        /* First encounter of w1,w2 */        bg = (clm_bg_t *) listelem_alloc(sizeof(clm_bg_t));        bg->w2 = w2;        bg->count = 1;        bg->next = lm->clm_ug[w1].w2list;        lm->clm_ug[w1].w2list = bg;    }    else        bg->count++;    lm->clm_ug[w1].sum_w2count++;}/* NOTE: Some approximations in the way the relative language weights are applied */int32cache_lm_score(cache_lm_t * lm, int32 w1, int32 w2, int32 * remwt){    int32 bgscr, ugscr, clmscr;    clm_bg_t *bg;    /* Unigram cache component */    if (lm->clm_ug[w2].count > 0)        ugscr =            LOG_COUNT(lm->clm_ug[w2].count) - LOG_COUNT(lm->sum_ugcount);    else        ugscr = log0;    ugscr += lm->log_uw;    /* Bigram cache component */    for (bg = lm->clm_ug[w1].w2list; bg && (bg->w2 != w2); bg = bg->next);    if (bg)        bgscr =            LOG_COUNT(bg->count) - LOG_COUNT(lm->clm_ug[w1].sum_w2count);    else        bgscr = log0;    bgscr += lm->log_bw;    /* Combine unigram and bigram cache component scores */    if ((ugscr > log0) || (bgscr > log0)) {        FAST_ADD(clmscr, ugscr, bgscr, fe_logadd_table, fe_logadd_table_size);    }    else        clmscr = log0;    *remwt = lm->log_remwt;    return (clmscr);}voidcache_lm_dump(cache_lm_t * lm, char *file){    FILE *fp;    int32 i;    clm_bg_t *bg;    if ((fp = fopen(file, "w")) == NULL) {        E_ERROR("fopen(%s,w) failed\n", file);        return;    }    fprintf(fp, "#CacheLMDump\n");    fprintf(fp, "#Unigrams\n");    for (i = 0; i < lm->n_word; i++) {        if 
(lm->clm_ug[i].count > 0)            fprintf(fp, "%d %s\n", lm->clm_ug[i].count,                    kb_get_word_str(i));    }    fprintf(fp, "#Bigrams\n");    for (i = 0; i < lm->n_word; i++) {        for (bg = lm->clm_ug[i].w2list; bg; bg = bg->next) {            fprintf(fp, "%d %s %s\n", bg->count,                    kb_get_word_str(i), kb_get_word_str(bg->w2));        }    }    fprintf(fp, "#End\n");    fclose(fp);}voidcache_lm_load(cache_lm_t * lm, char *file){    FILE *fp;    int32 i, n, w, w2;    char line[16384], wd[4096], wd2[4096];    if ((fp = fopen(file, "r")) == NULL) {        E_ERROR("fopen(%s,r) failed\n", file);        return;    }    if (fgets(line, sizeof(line), fp) == NULL) {        E_ERROR("%s: No header\n", file);        fclose(fp);        return;    }    if (strcmp(line, "#CacheLMDump\n") != 0) {        E_ERROR("%s: Bad header line: %s\n", file, line);        fclose(fp);        return;    }    if ((fgets(line, sizeof(line), fp) == NULL) ||        (strcmp(line, "#Unigrams\n") != 0)) {        E_ERROR("%s: Missing #Unigrams keyword\n", file);        fclose(fp);        return;    }    while (fgets(line, sizeof(line), fp) != NULL) {        if (sscanf(line, "%d %s", &n, wd) != 2)            break;        w = kb_get_word_id(wd);        if ((w < 0) || (w >= lm->n_word)) {            E_ERROR("%s: Unknown word(%s); ignored\n", file, wd);            continue;        }        for (i = 0; i < n; i++)            cache_lm_add_ug(lm, w);    }    if (strcmp(line, "#Bigrams\n") != 0) {        E_ERROR("%s: Missing #Bigrams keyword: %s\n", file, line);        fclose(fp);        return;    }    while (fgets(line, sizeof(line), fp) != NULL) {        if (sscanf(line, "%d %s %s", &n, wd, wd2) != 3)            break;        w = kb_get_word_id(wd);        w2 = kb_get_word_id(wd2);        if ((w < 0) || (w >= lm->n_word)) {            E_ERROR("%s: Unknown word(%s); ignored\n", file, wd);            continue;        }        if ((w2 < 0) || (w2 >= lm->n_word)) {           
 E_ERROR("%s: Unknown word(%s); ignored\n", file, wd2);            continue;        }        for (i = 0; i < n; i++)            cache_lm_add_bg(lm, w, w2);    }    if (strcmp(line, "#End\n") != 0)        E_ERROR("%s: Missing #End keyword: %s\n", file, line);    fclose(fp);}
cache_lm.c - 源码说明

本页面展示了「WinCE平台上的语音识别程序」中的 cache_lm.c 源码文件，采用 C 语言编写，共 316 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与WinCE相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?