📄 lm_3g.c
字号:
sprintf(dumpfile, "%s/%s.DMP", kbdumpdir, filename + i); /* * Allocate one extra unigram and bigram entry: sentinels to terminate * followers (bigrams and trigrams, respectively) of previous entry. */ model->bigrams = ckd_calloc(n_bigram + 1, sizeof(bigram_t)); if (n_trigram > 0) model->trigrams = ckd_calloc(n_trigram, sizeof(trigram_t)); if (n_trigram > 0) { model->tseg_base = ckd_calloc((n_bigram + 1) / BG_SEG_SZ + 1, sizeof(int32));#if 0 E_INFO("%8d = tseg_base entries allocated\n", (n_bigram + 1) / BG_SEG_SZ + 1);#endif } ReadUnigrams(fp, model); E_INFO("%8d = #unigrams created\n", model->ucount); init_sorted_list(&sorted_prob2); if (model->tcount > 0) init_sorted_list(&sorted_bo_wt2); ReadBigrams(fp, model, idfmt); model->bcount = FIRST_BG(model, model->ucount); model->n_prob2 = sorted_prob2.free; model->prob2 = vals_in_sorted_list(&sorted_prob2); free_sorted_list(&sorted_prob2); E_INFO("\n%8d = #bigrams created\n", model->bcount); E_INFO("%8d = #prob2 entries\n", model->n_prob2); if (model->tcount > 0) { /* Create trigram bo-wts array */ model->n_bo_wt2 = sorted_bo_wt2.free; model->bo_wt2 = vals_in_sorted_list(&sorted_bo_wt2); free_sorted_list(&sorted_bo_wt2); E_INFO("%8d = #bo_wt2 entries\n", model->n_bo_wt2); init_sorted_list(&sorted_prob3); ReadTrigrams(fp, model, idfmt); model->tcount = FIRST_TG(model, model->bcount); model->n_prob3 = sorted_prob3.free; model->prob3 = vals_in_sorted_list(&sorted_prob3); E_INFO("\n%8d = #trigrams created\n", model->tcount); E_INFO("%8d = #prob3 entries\n", model->n_prob3); free_sorted_list(&sorted_prob3); } /* Now dump this file if requrested. */ /* HACK!! to avoid unnecessarily creating dump files for small LMs */ if (kbdumpdir && (model->bcount + model->tcount > 200000)) lm3g_dump(dumpfile, model, filename); if (usingPipe) {#if defined(_WIN32_WCE) || defined(GNUWINCE) E_FATAL("No popen() on WinCE!\n");#else /* !GNUWINCE */#ifdef WIN32 _pclose(fp);#else pclose(fp);#endif /* !WIN32 */#endif /* !GNUWINCE */ } else fclose(fp); *out_model = model; return 0;}/* * Read in a trigram language model from the given file. The LM tokens can be word * classes. However a given actual word can belong to AT MOST ONE of the LM classes * used by this LM. */int32lm_read_clm(char const *filename, char const *lmname, double lw, double uw, double wip, lmclass_t * lmclass, int32 n_lmclass){ lm_t *model; int32 i, j, k, last_bg, last_tg; int32 dictid, classid, notindict, maperr; lmclass_word_t lmclass_word; int do_mmap; E_INFO("Reading LM file %s (name \"%s\")\n", filename, lmname); do_mmap = cmd_ln_boolean("-mmap"); /* Make sure no LM with same lmname already exists; if so, delete it */ if (lmname_to_id(lmname) >= 0) lm_delete(lmname); /* Try to read it as a dump file. */ if (lm3g_load(filename, lmname, &model, filename) < 0) { if (lmtext_load(filename, lmname, &model) < 0) { E_FATAL("Failed to load LM (text or dump format) from %s\n", filename); } } lmp = model; /* * Make a local copy of the LM Classes used by this LM. The unigrams_t.mapid * field can refer to the index of the local copy array. */ if (n_lmclass > 0) { model->lmclass = ckd_calloc(n_lmclass, sizeof(lmclass_t)); for (i = 0; i < n_lmclass; i++) model->lmclass[i] = lmclass[i]; } else model->lmclass = NULL; model->n_lmclass = n_lmclass; model->inclass_ugscore = ckd_calloc(model->dict_size, sizeof(int32)); /* * Create mapping from dictionary ID to unigram index. And also mapping for * LM wids (to dictionary or LMclass ids) in unigram array. */ for (i = 0; i < model->dict_size; i++) model->dictwid_map[i] = LM_DICTWID_BADMAP; notindict = 0; maperr = 0; for (i = 0; i < model->ucount; i++) { model->unigrams[i].mapid = kb_get_word_id(word_str[i]); classid = lm_get_classid(model, word_str[i]); if (model->unigrams[i].mapid >= 0) { /* unigram[i] is a dictionary word */ if (classid >= 0) { E_ERROR("'%s' is both a word and an LM class name\n", word_str[i]); maperr = 1; } else model->dictwid_map[model->unigrams[i].mapid] = i; } else { if (classid >= 0) { /* unigram[i] is an LM class */ model->unigrams[i].mapid = classid; /* Create entries in dictwid_map for each word in the class */ lmclass_word = lmclass_firstword(LM_CLASSID_TO_CLASS(model, classid)); while (lmclass_isword(lmclass_word)) { dictid = lmclass_getwid(lmclass_word); if (dictid >= 0) { if (model->dictwid_map[dictid] >= 0) { E_ERROR("Multiple mappings of '%s' in LM\n", lmclass_getword(lmclass_word)); maperr = 1; } else { model->dictwid_map[dictid] = i; model->inclass_ugscore[dictid] = lmclass_getprob(lmclass_word) * lw; } } else { E_ERROR("'%s' is in LM class definition but not in dictionary\n", lmclass_getword(lmclass_word)); notindict++; } lmclass_word = lmclass_nextword(LM_CLASSID_TO_CLASS (model, classid), lmclass_word); } } else { E_ERROR("'%s' is in LM unigrams but not in dictionary\n", word_str[i]); notindict++; } } } if (maperr) E_FATAL("Errors in LM; exiting\n"); if (notindict > 0) E_WARN("%d LM words not in dict; ignored\n", notindict); /* * Discourage expansion of end_sym and transition to start_sym. (The given * Darpa LM may contain some spurious values that don't reflect these * requirements.) */ /* bo_wt(</s>) = MIN_PROB_F */ for (i = 0; (i < model->ucount) && (strcmp(word_str[i], end_sym) != 0); i++); E_INFO("bo_wt(%s) changed from %.4f to %.4f\n", word_str[i], model->unigrams[i].bo_wt1.f, MIN_PROB_F); model->unigrams[i].bo_wt1.f = MIN_PROB_F; /* unigram prob(<s>) = MIN_PROB_F */ for (i = 0; (i < model->ucount) && (strcmp(word_str[i], start_sym) != 0); i++); E_INFO("prob(%s) changed from %.4f to %.4f\n", word_str[i], model->unigrams[i].prob1.f, MIN_PROB_F); model->unigrams[i].prob1.f = MIN_PROB_F; /* bigram prob(<s>,<s>) = MIN_PROB_F (if bigram exists) */ j = FIRST_BG(model, i); last_bg = LAST_BG(model, i); for (; (j <= last_bg) && (strcmp(word_str[BG_WID(model, j)], start_sym) != 0); j++); if (j <= last_bg) { E_INFO("prob(%s,%s) changed from %.4f to %.4f\n", word_str[i], word_str[BG_WID(model, j)], model->prob2[model->bigrams[j].prob2].f, model->prob2[0].f); if (!do_mmap) model->bigrams[j].prob2 = 0; if (model->tcount > 0) { /* trigram prob(<s>,<s>,<s>) = MIN_PROB_F (if trigram exists) */ k = FIRST_TG(model, j); last_tg = LAST_TG(model, j); for (; k <= last_tg; k++) { if (strcmp(word_str[TG_WID(model, k)], start_sym) == 0) break; } if (k <= last_tg) { E_INFO("prob(%s,%s,%s) changed from %.4f to %.4f\n", word_str[i], word_str[BG_WID(model, j)], word_str[TG_WID(model, k)], model->prob3[model->trigrams[k].prob3].f, model->prob3[0].f); if (!do_mmap) model->trigrams[k].prob3 = 0; } } } /* bigram prob(<s>,</s>) = MIN_PROB_F (if bigram exists) */ j = FIRST_BG(model, i); last_bg = LAST_BG(model, i); for (; (j <= last_bg) && (strcmp(word_str[BG_WID(model, j)], end_sym) != 0); j++); if (j <= last_bg) { E_INFO("prob(%s,%s) changed from %.4f to %.4f\n", word_str[i], word_str[BG_WID(model, j)], model->prob2[model->bigrams[j].prob2].f, model->prob2[0].f); if (!do_mmap) model->bigrams[j].prob2 = 0; } lm_add(lmname, model, lw, uw, wip); hash_table_free(model->HT); if (!do_mmap) for (i = 0; i < model->ucount; i++) free(word_str[i]); free(word_str); return 0;}int32lm_read(char const *filename, char const *lmname, double lw, double uw, double wip){ return lm_read_clm(filename, lmname, lw, uw, wip, NULL, 0);}voidlm_init_oov(void){ int32 i, j, baseid; int32 first_oov = 0, last_oov = -1; lm_t *model; model = lm_name2lm(""); /* Add initial list of OOV words to LM unigrams */ first_oov = dict_get_first_initial_oov(); last_oov = dict_get_last_initial_oov(); E_INFO("Adding %d initial OOV words to LM\n", last_oov - first_oov + 1); oov_ugprob = cmd_ln_float32("-oovugprob"); for (i = first_oov; i <= last_oov; i++) { /* Add only base pronunciations */ if ((baseid = dictid_to_baseid(word_dict, i)) == i) { if ((j = lm_add_word(model, i)) >= 0) model->dictwid_map[i] = j; } }}/* * Add new word with given dictionary wid and unigram prob = oov_ugprob to * model->unigrams. * Return LM wid of inserted word if successful, otherwise -1. * (Currently some problems with adding alternative pronunciations...) */int32lm_add_word(lm_t * model, int32 dictwid){ /* Make sure new word not already in LM */ if (model->dictwid_map[dictwid] >= 0) { E_WARN("lm_add_word: Word '%s' already in LM, ignored\n", dictid_to_str(word_dict, dictwid)); return model->dictwid_map[dictwid]; } if (model->ucount >= model->max_ucount) { E_ERROR("lm_add_word(%s) failed; LM full\n", dictid_to_str(word_dict, dictwid)); return -1; } /* Append new word to unigrams */ model->unigrams[model->ucount].mapid = dictwid; model->unigrams[model->ucount].prob1.l = LWMUL(LOG10TOLOG(oov_ugprob), model->lw) + model->log_wip; model->unigrams[model->ucount].bo_wt1.l = LWMUL(LOG10TOLOG(0.0), model->lw); /* Advance the sentinel unigram */ model->unigrams[model->ucount + 1].bigrams = model->unigrams[model->ucount].bigrams; /* Update dictwid_map for this LM */ model->dictwid_map[dictwid] = model->ucount; return (model->ucount++);}/* * Add named model to list of models. If another with same name exists, delete it first. */voidlm_add(char const *lmname, lm_t * model, double lw, double uw, double wip){ if (lmname_to_id(lmname) >= 0) lm_delete(lmname); model->tginfo = ckd_calloc(model->max_ucount, sizeof(tginfo_t *)); if (n_lm == n_lm_alloc) { lmset = ckd_realloc(lmset, (n_lm + 15) * sizeof(struct lmset_s)); n_lm_alloc += 15; } lmset[n_lm].lm = model; lmset[n_lm].name = ckd_salloc(lmname); lm_set_param(model, lw, uw, wip, FALSE); n_lm++; E_INFO("LM(\"%s\") added\n", lmname);}/* * Delete named LM from list of LMs and reclaim all space. */int32lm_delete(char const *name){ int32 i, u; lm_t *model; tginfo_t *tginfo, *next_tginfo; if ((i = lmname_to_id(name)) < 0) return (-1); model = lmset[i].lm; free(model->unigrams); free(model->bigrams); free(model->prob2); if (model->tcount > 0) { free(model->trigrams); free(model->tseg_base); free(model->bo_wt2); free(model->prob3); } hash_table_free(model->HT); for (u = 0; u < model->max_ucount; u++) for (tginfo = model->tginfo[u]; tginfo; tginfo = next_tginfo) { next_tginfo = tginfo->next; listelem_free((void *) tginfo, sizeof(tginfo_t)); } free(model->tginfo); if (model->lmclass) free(model->lmclass); free(model->inclass_ugscore); free(model->dictwid_map); free(model); free(lmset[i].name); for (; i < n_lm - 1; i++) lmset[i] = lmset[i + 1]; --n_lm; E_INFO("LM(\"%s\") deleted\n", name); return (0);}/* * Set the active LM to the one identified by "name". Return 0 if successful,
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -