📄 lm_3g.c
字号:
fgets(string, sizeof(string), fp); while ((strcmp(string, "\\data\\\n") != 0) && (!feof(fp))); if (strcmp(string, "\\data\\\n") != 0) E_FATAL("No \\data\\ mark in LM file\n"); *n_ug = *n_bg = *n_tg = 0; while (fgets(string, sizeof(string), fp) != NULL) { if (sscanf(string, "ngram %d=%d", &ngram, &ngram_cnt) != 2) break; switch (ngram) { case 1: *n_ug = ngram_cnt; break; case 2: *n_bg = ngram_cnt; break; case 3: *n_tg = ngram_cnt; break; default: E_FATAL("Unknown ngram (%d)\n", ngram); break; } } /* Position file to just after the unigrams header '\1-grams:\' */ while ((strcmp(string, "\\1-grams:\n") != 0) && (!feof(fp))) fgets(string, sizeof(string), fp); /* Check counts; NOTE: #trigrams *CAN* be 0 */ if ((*n_ug <= 0) || (*n_bg <= 0) || (*n_tg < 0)) E_FATAL("Bad or missing ngram count\n");}/* * Read in the unigrams from given file into the LM structure model. On * entry to this procedure, the file pointer is positioned just after the * header line '\1-grams:'. */static voidReadUnigrams(FILE * fp, lm_t * model){ char string[256]; char name[128]; int32 wcnt; float p1, bo_wt; E_INFO("Reading unigrams\n"); wcnt = 0; while ((fgets(string, sizeof(string), fp) != NULL) && (strcmp(string, "\\2-grams:\n") != 0)) { if (sscanf(string, "%f %s %f", &p1, name, &bo_wt) != 3) { if (string[0] != '\n') E_WARN("Format error; unigram ignored:%s", string); continue; } if (wcnt >= model->ucount) E_FATAL("Too many unigrams\n"); /* Associate name with word id */ word_str[wcnt] = ckd_salloc(name); hash_table_enter(model->HT, word_str[wcnt], (void *) wcnt); model->unigrams[wcnt].prob1.f = p1; model->unigrams[wcnt].bo_wt1.f = bo_wt; model->unigrams[wcnt].mapid = wcnt; wcnt++; } if (model->ucount != wcnt) { E_WARN("lm_t.ucount(%d) != #unigrams read(%d)\n", model->ucount, wcnt); model->ucount = wcnt; }}/* * Read bigrams from given file into given model structure. File may be arpabo * or arpabo-id format, depending on idfmt = 0 or 1. 
*/
/*
 * Details for ReadBigrams:
 *
 * Each line is parsed as "<prob> <w1> <w2> [<backoff-weight>]", where w1
 * and w2 are word strings (idfmt == 0) or integer word ids (idfmt != 0).
 * The back-off weight is required only when the model has trigrams
 * (model->tcount > 0).  Blank lines are skipped.  The first non-blank
 * line with too few fields ends the section; it must be either "\end\"
 * or "\3-grams:", otherwise the load is fatal.
 *
 * Bigrams are stored consecutively in model->bigrams and must appear in
 * non-decreasing w1 order.  Whenever w1 advances, unigrams[u].bigrams is
 * set to the current bigram count for every unigram id u after the
 * previous w1 up to and including the new one.  After the section ends,
 * the remaining unigram entries, through index model->ucount inclusive,
 * are set to the final bigram count.
 * NOTE(review): this writes unigrams[ucount], so the array is assumed to
 * have a sentinel slot -- confirm against the allocator.
 *
 * Probabilities and back-off weights are truncated to 4 decimal digits
 * and stored as indices returned by sorted_id().
 */
static void
ReadBigrams(FILE * fp, lm_t * model, int32 idfmt)
{
    char string[1024], word1[256], word2[256];
    int32 w1, w2, prev_w1, bgcount, p;
    bigram_t *bgptr;
    float p2, bo_wt;
    int32 n_fld, n;

    E_INFO("Reading bigrams\n");

    bgcount = 0;
    bgptr = model->bigrams;
    prev_w1 = -1;
    n_fld = (model->tcount > 0) ? 4 : 3;

    bo_wt = 0.0;
    /* Start from a defined (empty) line so the section-terminator check
     * after the loop never reads an uninitialized buffer when the very
     * first fgets() fails. */
    string[0] = '\0';
    while (fgets(string, sizeof(string), fp) != NULL) {
        /* Field widths bound the word buffers: the line may hold up to
         * 1023 characters but word1/word2 only 256 each. */
        if (!idfmt)
            n = sscanf(string, "%f %255s %255s %f", &p2, word1, word2,
                       &bo_wt);
        else
            n = sscanf(string, "%f %d %d %f", &p2, &w1, &w2, &bo_wt);
        if (n < n_fld) {
            if (string[0] != '\n')
                break;          /* Non-blank, unparsable: end of section */
            continue;           /* Blank line */
        }

        if (!idfmt) {
            if ((w1 = wstr2wid(model, word1)) == NO_WORD)
                E_FATAL("Unknown word: %s\n", word1);
            if ((w2 = wstr2wid(model, word2)) == NO_WORD)
                E_FATAL("Unknown word: %s\n", word2);
        }
        else {
            if ((w1 >= model->ucount) || (w2 >= model->ucount) ||
                (w1 < 0) || (w2 < 0))
                E_FATAL("Bad bigram: %s", string);
        }

        /* HACK!! to quantize probs to 4 decimal digits */
        p = p2 * 10000;
        p2 = p * 0.0001;
        p = bo_wt * 10000;
        bo_wt = p * 0.0001;

        if (bgcount >= model->bcount)
            E_FATAL("Too many bigrams\n");

        bgptr->wid = w2;
        bgptr->prob2 = sorted_id(&sorted_prob2, &p2);
        if (model->tcount > 0)
            bgptr->bo_wt2 = sorted_id(&sorted_bo_wt2, &bo_wt);

        if (w1 != prev_w1) {
            if (w1 < prev_w1)
                E_FATAL("Bigrams not in unigram order\n");

            /* Every unigram skipped over, and w1 itself, has its first
             * bigram at the current position. */
            for (prev_w1++; prev_w1 <= w1; prev_w1++)
                model->unigrams[prev_w1].bigrams = bgcount;
            prev_w1 = w1;
        }

        bgcount++;
        bgptr++;

        /* Progress indicator every 65536 bigrams */
        if ((bgcount & 0x0000ffff) == 0) {
            E_INFOCONT(".");
        }
    }
    if ((strcmp(string, "\\end\\\n") != 0)
        && (strcmp(string, "\\3-grams:\n") != 0))
        E_FATAL("Bad bigram: %s\n", string);

    /* Remaining unigrams (and the entry at index ucount) have no bigrams */
    for (prev_w1++; prev_w1 <= model->ucount; prev_w1++)
        model->unigrams[prev_w1].bigrams = bgcount;
}

/*
 * Very similar to ReadBigrams.
*/static voidReadTrigrams(FILE * fp, lm_t * model, int32 idfmt){ char string[1024], word1[256], word2[256], word3[256]; int32 i, n, w1, w2, w3, prev_w1, prev_w2, tgcount, prev_bg, bg, endbg, p; int32 seg, prev_seg, prev_seg_lastbg; trigram_t *tgptr; bigram_t *bgptr; float p3; E_INFO("Reading trigrams\n"); tgcount = 0; tgptr = model->trigrams; prev_w1 = -1; prev_w2 = -1; prev_bg = -1; prev_seg = -1; while (fgets(string, sizeof(string), fp) != NULL) { if (!idfmt) n = sscanf(string, "%f %s %s %s", &p3, word1, word2, word3); else n = sscanf(string, "%f %d %d %d", &p3, &w1, &w2, &w3); if (n != 4) { if (string[0] != '\n') break; continue; } if (!idfmt) { if ((w1 = wstr2wid(model, word1)) == NO_WORD) E_FATAL("Unknown word: %s\n", word1); if ((w2 = wstr2wid(model, word2)) == NO_WORD) E_FATAL("Unknown word: %s\n", word2); if ((w3 = wstr2wid(model, word3)) == NO_WORD) E_FATAL("Unknown word: %s\n", word3); } else { if ((w1 >= model->ucount) || (w2 >= model->ucount) || (w3 >= model->ucount) || (w1 < 0) || (w2 < 0) || (w3 < 0)) E_FATAL("Bad trigram: %s", string); } /* HACK!! to quantize probs to 4 decimal digits */ p = p3 * 10000; p3 = p * 0.0001; if (tgcount >= model->tcount) E_FATAL("Too many trigrams\n"); tgptr->wid = w3; tgptr->prob3 = sorted_id(&sorted_prob3, &p3); if ((w1 != prev_w1) || (w2 != prev_w2)) { /* Trigram for a new bigram; update tg info for all previous bigrams */ if ((w1 < prev_w1) || ((w1 == prev_w1) && (w2 < prev_w2))) E_FATAL("Trigrams not in bigram order\n"); bg = (w1 != prev_w1) ? model->unigrams[w1].bigrams : prev_bg + 1; endbg = model->unigrams[w1 + 1].bigrams; bgptr = model->bigrams + bg; for (; (bg < endbg) && (bgptr->wid != w2); bg++, bgptr++); if (bg >= endbg) E_FATAL("Missing bigram for trigram: %s", string); /* bg = bigram entry index for <w1,w2>. 
Update tseg_base */ seg = bg >> LOG_BG_SEG_SZ; for (i = prev_seg + 1; i <= seg; i++) model->tseg_base[i] = tgcount; /* Update trigrams pointers for all bigrams until bg */ if (prev_seg < seg) { int32 tgoff = 0; if (prev_seg >= 0) { tgoff = tgcount - model->tseg_base[prev_seg]; if (tgoff > 65535) E_FATAL("Offset from tseg_base > 65535\n"); } prev_seg_lastbg = ((prev_seg + 1) << LOG_BG_SEG_SZ) - 1; bgptr = model->bigrams + prev_bg; for (++prev_bg, ++bgptr; prev_bg <= prev_seg_lastbg; prev_bg++, bgptr++) bgptr->trigrams = tgoff; for (; prev_bg <= bg; prev_bg++, bgptr++) bgptr->trigrams = 0; } else { int32 tgoff; tgoff = tgcount - model->tseg_base[prev_seg]; if (tgoff > 65535) E_FATAL("Offset from tseg_base > 65535\n"); bgptr = model->bigrams + prev_bg; for (++prev_bg, ++bgptr; prev_bg <= bg; prev_bg++, bgptr++) bgptr->trigrams = tgoff; } prev_w1 = w1; prev_w2 = w2; prev_bg = bg; prev_seg = seg; } tgcount++; tgptr++; if ((tgcount & 0x0000ffff) == 0) { E_INFOCONT("."); } } if (strcmp(string, "\\end\\\n") != 0) E_FATAL("Bad trigram: %s\n", string); for (prev_bg++; prev_bg <= model->bcount; prev_bg++) { if ((prev_bg & (BG_SEG_SZ - 1)) == 0) model->tseg_base[prev_bg >> LOG_BG_SEG_SZ] = tgcount; if ((tgcount - model->tseg_base[prev_bg >> LOG_BG_SEG_SZ]) > 65535) E_FATAL("Offset from tseg_base > 65535\n"); model->bigrams[prev_bg].trigrams = tgcount - model->tseg_base[prev_bg >> LOG_BG_SEG_SZ]; }}static FILE *lm_file_open(char const *filename, int32 usepipe){ FILE *fp; if (usepipe) {#if defined(_WIN32_WCE) || defined(GNUWINCE) E_FATAL("No popen() on WinCE\n");#else char command[1024];#ifdef WIN32 /* FIXME: COMPLETELY BOGUS!!! 
*/ sprintf(command, "D:\\compress\\gzip.exe -d -c %s", filename); if ((fp = _popen(command, "r")) == NULL) E_FATAL("Cannot popen %s\n", command);#else sprintf(command, "zcat %s", filename); if ((fp = popen(command, "r")) == NULL) E_FATAL("Cannot popen %s\n", command);#endif /* !WIN32 */#endif /* !GNUWINCE */ } else { fp = myfopen(filename, "r"); } return (fp);}static int32lm_get_classid(lm_t * model, char *name){ int32 i; if (!model->lmclass) return -1; for (i = 0; i < model->n_lmclass; i++) { if (strcmp(lmclass_getname(model->lmclass[i]), name) == 0) return (i + LM_CLASSID_BASE); } return -1;}static int32get_dict_size(int32 *inout_n_unigram, char const *lmname){ int32 dict_size, max_new_oov; dict_size = word_dict->dict_entry_count; E_INFO("%d words in dictionary\n", dict_size); /* * If this is the "BASE" LM also count space for OOVs and words added at run time. * UGLY!! Assumes that OOVs will only be added to LM with no name. */ if (lmname[0] == '\0') { int32 first_oov, last_oov; first_oov = dict_get_first_initial_oov(); last_oov = dict_get_last_initial_oov(); *inout_n_unigram += (last_oov - first_oov + 1); } /* Add space for words added in at run time */ max_new_oov = cmd_ln_int32("-maxnewoov"); *inout_n_unigram += max_new_oov; if (dict_size >= 65535) E_FATAL("#dict-words(%d) > 65534\n", dict_size); return dict_size;}static int32lmtext_load(char const *filename, char const *lmname, lm_t **out_model){ FILE *fp; size_t k; int32 usingPipe = FALSE; int32 n_unigram; int32 n_bigram; int32 n_trigram; lm_t *model; int32 idfmt, dict_size, i; char *kbdumpdir, dumpfile[1024]; /* Check if a compressed file */ k = strlen(filename);#ifdef WIN32 usingPipe = (k > 3) && ((strcmp(filename + k - 3, ".gz") == 0) || (strcmp(filename + k - 3, ".GZ") == 0));#else usingPipe = (k > 2) && ((strcmp(filename + k - 2, ".Z") == 0) || (strcmp(filename + k - 2, ".z") == 0));#endif /* Check if an .arpabo-id format file; More HACK!! 
Hardwired check for -id */ if (usingPipe) k -= 2; idfmt = ((k > 3) && (strncmp(filename + k - 3, "-id", 3) == 0)); fp = lm_file_open(filename, usingPipe); /* Read #unigrams, #bigrams, #trigrams from file */ ReadNgramCounts(fp, &n_unigram, &n_bigram, &n_trigram); E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram); /* Determine dictionary size (for dict-wid -> LM-wid map) */ dict_size = get_dict_size(&n_unigram, lmname); /* Allocate space for word strings. */ word_str = ckd_calloc(n_unigram, sizeof(char *)); /* Allocate space for LM, including initial OOVs and placeholders; initialize it */ model = NewModel(n_unigram, n_bigram, n_trigram, dict_size); /* Create name for binary dump form of Darpa LM file */#ifdef WIN32 for (i = strlen(filename) - 1; (i >= 0) && (filename[i] != '\\') && (filename[i] != '/'); --i);#else for (i = strlen(filename) - 1; (i >= 0) && (filename[i] != '/'); --i);#endif i++; kbdumpdir = cmd_ln_str("-lmdumpdir"); /* form dumpfilename */ if (kbdumpdir)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -