lm_3g.c

来自「WinCE平台上的语音识别程序」· C语言代码 · 共 2,105 行 · 第 1/5 页
2,105 行
        sprintf(dumpfile, "%s/%s.DMP", kbdumpdir, filename + i);    /*     * Allocate one extra unigram and bigram entry: sentinels to terminate     * followers (bigrams and trigrams, respectively) of previous entry.     */    model->bigrams =        ckd_calloc(n_bigram + 1, sizeof(bigram_t));    if (n_trigram > 0)        model->trigrams =            ckd_calloc(n_trigram, sizeof(trigram_t));    if (n_trigram > 0) {        model->tseg_base =            ckd_calloc((n_bigram + 1) / BG_SEG_SZ + 1,                       sizeof(int32));#if 0        E_INFO("%8d = tseg_base entries allocated\n",               (n_bigram + 1) / BG_SEG_SZ + 1);#endif    }    ReadUnigrams(fp, model);    E_INFO("%8d = #unigrams created\n", model->ucount);    init_sorted_list(&sorted_prob2);    if (model->tcount > 0)        init_sorted_list(&sorted_bo_wt2);    ReadBigrams(fp, model, idfmt);    model->bcount = FIRST_BG(model, model->ucount);    model->n_prob2 = sorted_prob2.free;    model->prob2 = vals_in_sorted_list(&sorted_prob2);    free_sorted_list(&sorted_prob2);    E_INFO("\n%8d = #bigrams created\n", model->bcount);    E_INFO("%8d = #prob2 entries\n", model->n_prob2);    if (model->tcount > 0) {        /* Create trigram bo-wts array */        model->n_bo_wt2 = sorted_bo_wt2.free;        model->bo_wt2 = vals_in_sorted_list(&sorted_bo_wt2);        free_sorted_list(&sorted_bo_wt2);        E_INFO("%8d = #bo_wt2 entries\n", model->n_bo_wt2);        init_sorted_list(&sorted_prob3);        ReadTrigrams(fp, model, idfmt);        model->tcount = FIRST_TG(model, model->bcount);        model->n_prob3 = sorted_prob3.free;        model->prob3 = vals_in_sorted_list(&sorted_prob3);        E_INFO("\n%8d = #trigrams created\n", model->tcount);        E_INFO("%8d = #prob3 entries\n", model->n_prob3);        free_sorted_list(&sorted_prob3);    }    /* Now dump this file if requrested. */    /* HACK!! to avoid unnecessarily creating dump files for small LMs */    if (kbdumpdir && (model->bcount + model->tcount > 200000))        lm3g_dump(dumpfile, model, filename);    if (usingPipe) {#if defined(_WIN32_WCE) || defined(GNUWINCE)        E_FATAL("No popen() on WinCE!\n");#else                           /* !GNUWINCE */#ifdef WIN32        _pclose(fp);#else        pclose(fp);#endif                          /* !WIN32 */#endif                          /* !GNUWINCE */    }    else        fclose(fp);    *out_model = model;    return 0;}/* * Read in a trigram language model from the given file.  The LM tokens can be word * classes.  However a given actual word can belong to AT MOST ONE of the LM classes * used by this LM. */int32lm_read_clm(char const *filename,            char const *lmname,            double lw,            double uw, double wip, lmclass_t * lmclass, int32 n_lmclass){    lm_t *model;    int32 i, j, k, last_bg, last_tg;    int32 dictid, classid, notindict, maperr;    lmclass_word_t lmclass_word;    int do_mmap;    E_INFO("Reading LM file %s (name \"%s\")\n", filename, lmname);    do_mmap = cmd_ln_boolean("-mmap");    /* Make sure no LM with same lmname already exists; if so, delete it */    if (lmname_to_id(lmname) >= 0)        lm_delete(lmname);    /* Try to read it as a dump file. */    if (lm3g_load(filename, lmname, &model, filename) < 0) {        if (lmtext_load(filename, lmname, &model) < 0) {            E_FATAL("Failed to load LM (text or dump format) from %s\n", filename);        }    }    lmp = model;    /*     * Make a local copy of the LM Classes used by this LM.  The unigrams_t.mapid     * field can refer to the index of the local copy array.     */    if (n_lmclass > 0) {        model->lmclass =            ckd_calloc(n_lmclass, sizeof(lmclass_t));        for (i = 0; i < n_lmclass; i++)            model->lmclass[i] = lmclass[i];    }    else        model->lmclass = NULL;    model->n_lmclass = n_lmclass;    model->inclass_ugscore = ckd_calloc(model->dict_size, sizeof(int32));    /*     * Create mapping from dictionary ID to unigram index.  And also mapping for     * LM wids (to dictionary or LMclass ids) in unigram array.     */    for (i = 0; i < model->dict_size; i++)        model->dictwid_map[i] = LM_DICTWID_BADMAP;    notindict = 0;    maperr = 0;    for (i = 0; i < model->ucount; i++) {        model->unigrams[i].mapid = kb_get_word_id(word_str[i]);        classid = lm_get_classid(model, word_str[i]);        if (model->unigrams[i].mapid >= 0) {    /* unigram[i] is a dictionary word */            if (classid >= 0) {                E_ERROR("'%s' is both a word and an LM class name\n",                        word_str[i]);                maperr = 1;            }            else                model->dictwid_map[model->unigrams[i].mapid] = i;        }        else {            if (classid >= 0) { /* unigram[i] is an LM class */                model->unigrams[i].mapid = classid;                /* Create entries in dictwid_map for each word in the class */                lmclass_word =                    lmclass_firstword(LM_CLASSID_TO_CLASS(model, classid));                while (lmclass_isword(lmclass_word)) {                    dictid = lmclass_getwid(lmclass_word);                    if (dictid >= 0) {                        if (model->dictwid_map[dictid] >= 0) {                            E_ERROR("Multiple mappings of '%s' in LM\n",                                    lmclass_getword(lmclass_word));                            maperr = 1;                        }                        else {                            model->dictwid_map[dictid] = i;                            model->inclass_ugscore[dictid] =                                lmclass_getprob(lmclass_word) * lw;                        }                    }		    else {			E_ERROR("'%s' is in LM class definition but not in dictionary\n",				lmclass_getword(lmclass_word));			notindict++;		    }                    lmclass_word =                        lmclass_nextword(LM_CLASSID_TO_CLASS                                         (model, classid), lmclass_word);                }            }            else {		E_ERROR("'%s' is in LM unigrams but not in dictionary\n", word_str[i]);		notindict++;	    }        }    }    if (maperr)        E_FATAL("Errors in LM; exiting\n");    if (notindict > 0)        E_WARN("%d LM words not in dict; ignored\n", notindict);    /*     * Discourage expansion of end_sym and transition to start_sym.  (The given     * Darpa LM may contain some spurious values that don't reflect these     * requirements.)     */    /* bo_wt(</s>) = MIN_PROB_F */    for (i = 0; (i < model->ucount) && (strcmp(word_str[i], end_sym) != 0);         i++);    E_INFO("bo_wt(%s) changed from %.4f to %.4f\n", word_str[i],           model->unigrams[i].bo_wt1.f, MIN_PROB_F);    model->unigrams[i].bo_wt1.f = MIN_PROB_F;    /* unigram prob(<s>) = MIN_PROB_F */    for (i = 0;         (i < model->ucount) && (strcmp(word_str[i], start_sym) != 0);         i++);    E_INFO("prob(%s) changed from %.4f to %.4f\n", word_str[i],           model->unigrams[i].prob1.f, MIN_PROB_F);    model->unigrams[i].prob1.f = MIN_PROB_F;    /* bigram prob(<s>,<s>) = MIN_PROB_F (if bigram exists) */    j = FIRST_BG(model, i);    last_bg = LAST_BG(model, i);    for (; (j <= last_bg)         && (strcmp(word_str[BG_WID(model, j)], start_sym) != 0); j++);    if (j <= last_bg) {        E_INFO("prob(%s,%s) changed from %.4f to %.4f\n",               word_str[i], word_str[BG_WID(model, j)],               model->prob2[model->bigrams[j].prob2].f, model->prob2[0].f);        if (!do_mmap)            model->bigrams[j].prob2 = 0;        if (model->tcount > 0) {            /* trigram prob(<s>,<s>,<s>) = MIN_PROB_F (if trigram exists) */            k = FIRST_TG(model, j);            last_tg = LAST_TG(model, j);            for (; k <= last_tg; k++) {                if (strcmp(word_str[TG_WID(model, k)], start_sym) == 0)                    break;            }            if (k <= last_tg) {                E_INFO("prob(%s,%s,%s) changed from %.4f to %.4f\n",                       word_str[i], word_str[BG_WID(model, j)],                       word_str[TG_WID(model, k)],                       model->prob3[model->trigrams[k].prob3].f,                       model->prob3[0].f);                if (!do_mmap)                    model->trigrams[k].prob3 = 0;            }        }    }    /* bigram prob(<s>,</s>) = MIN_PROB_F (if bigram exists) */    j = FIRST_BG(model, i);    last_bg = LAST_BG(model, i);    for (; (j <= last_bg)         && (strcmp(word_str[BG_WID(model, j)], end_sym) != 0); j++);    if (j <= last_bg) {        E_INFO("prob(%s,%s) changed from %.4f to %.4f\n",               word_str[i], word_str[BG_WID(model, j)],               model->prob2[model->bigrams[j].prob2].f, model->prob2[0].f);        if (!do_mmap)            model->bigrams[j].prob2 = 0;    }    lm_add(lmname, model, lw, uw, wip);    hash_table_free(model->HT);    if (!do_mmap)        for (i = 0; i < model->ucount; i++)            free(word_str[i]);    free(word_str);    return 0;}int32lm_read(char const *filename, char const *lmname,        double lw, double uw, double wip){    return lm_read_clm(filename, lmname, lw, uw, wip, NULL, 0);}voidlm_init_oov(void){    int32 i, j, baseid;    int32 first_oov = 0, last_oov = -1;    lm_t *model;    model = lm_name2lm("");    /* Add initial list of OOV words to LM unigrams */    first_oov = dict_get_first_initial_oov();    last_oov = dict_get_last_initial_oov();    E_INFO("Adding %d initial OOV words to LM\n",           last_oov - first_oov + 1);    oov_ugprob = cmd_ln_float32("-oovugprob");    for (i = first_oov; i <= last_oov; i++) {        /* Add only base pronunciations */        if ((baseid = dictid_to_baseid(word_dict, i)) == i) {            if ((j = lm_add_word(model, i)) >= 0)                model->dictwid_map[i] = j;        }    }}/* * Add new word with given dictionary wid and unigram prob = oov_ugprob to * model->unigrams. * Return LM wid of inserted word if successful, otherwise -1. * (Currently some problems with adding alternative pronunciations...) */int32lm_add_word(lm_t * model, int32 dictwid){    /* Make sure new word not already in LM */    if (model->dictwid_map[dictwid] >= 0) {        E_WARN("lm_add_word: Word '%s' already in LM, ignored\n",               dictid_to_str(word_dict, dictwid));        return model->dictwid_map[dictwid];    }    if (model->ucount >= model->max_ucount) {        E_ERROR("lm_add_word(%s) failed; LM full\n",                dictid_to_str(word_dict, dictwid));        return -1;    }    /* Append new word to unigrams */    model->unigrams[model->ucount].mapid = dictwid;    model->unigrams[model->ucount].prob1.l =        LWMUL(LOG10TOLOG(oov_ugprob), model->lw) + model->log_wip;    model->unigrams[model->ucount].bo_wt1.l =        LWMUL(LOG10TOLOG(0.0), model->lw);    /* Advance the sentinel unigram */    model->unigrams[model->ucount + 1].bigrams =        model->unigrams[model->ucount].bigrams;    /* Update dictwid_map for this LM */    model->dictwid_map[dictwid] = model->ucount;    return (model->ucount++);}/* * Add named model to list of models.  If another with same name exists, delete it first. */voidlm_add(char const *lmname, lm_t * model, double lw, double uw, double wip){    if (lmname_to_id(lmname) >= 0)        lm_delete(lmname);    model->tginfo =        ckd_calloc(model->max_ucount, sizeof(tginfo_t *));    if (n_lm == n_lm_alloc) {        lmset = ckd_realloc(lmset, (n_lm + 15) * sizeof(struct lmset_s));        n_lm_alloc += 15;    }    lmset[n_lm].lm = model;    lmset[n_lm].name = ckd_salloc(lmname);    lm_set_param(model, lw, uw, wip, FALSE);    n_lm++;    E_INFO("LM(\"%s\") added\n", lmname);}/* * Delete named LM from list of LMs and reclaim all space. */int32lm_delete(char const *name){    int32 i, u;    lm_t *model;    tginfo_t *tginfo, *next_tginfo;    if ((i = lmname_to_id(name)) < 0)        return (-1);    model = lmset[i].lm;    free(model->unigrams);    free(model->bigrams);    free(model->prob2);    if (model->tcount > 0) {        free(model->trigrams);        free(model->tseg_base);        free(model->bo_wt2);        free(model->prob3);    }    hash_table_free(model->HT);    for (u = 0; u < model->max_ucount; u++)        for (tginfo = model->tginfo[u]; tginfo; tginfo = next_tginfo) {            next_tginfo = tginfo->next;            listelem_free((void *) tginfo, sizeof(tginfo_t));        }    free(model->tginfo);    if (model->lmclass)        free(model->lmclass);    free(model->inclass_ugscore);    free(model->dictwid_map);    free(model);    free(lmset[i].name);    for (; i < n_lm - 1; i++)        lmset[i] = lmset[i + 1];    --n_lm;    E_INFO("LM(\"%s\") deleted\n", name);    return (0);}/* * Set the active LM to the one identified by "name".  Return 0 if successful,
lm_3g.c - 源码说明

本页面展示了「WinCE平台上的语音识别程序」中的 lm_3g.c 源码文件，采用 C语言编程语言编写，共 2,105 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与WinCE相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?