📄 lm_3g.c
字号:
* -1 otherwise. */int32lm_set_current(char const *name){ int32 i; if ((i = lmname_to_id(name)) < 0) return (-1); lmp = lmset[i].lm; return (0);}static int32lmname_to_id(char const *name){ int32 i; for (i = 0; (i < n_lm) && (strcmp(lmset[i].name, name) != 0); i++); return ((i < n_lm) ? i : -1);}lm_t *lm_name2lm(char const *name){ int32 i; i = lmname_to_id(name); return ((i >= 0) ? lmset[i].lm : NULL);}char *get_current_lmname(){ int32 i; for (i = 0; (i < n_lm) && (lmset[i].lm != lmp); i++); return ((i < n_lm) ? lmset[i].name : NULL);}lm_t *lm_get_current(){ return (lmp);}int32get_n_lm(){ return (n_lm);}int32lm3g_n_lm(void){ return n_lm;}char *lm3g_index2name(int k){ if ((k >= 0) && (k < n_lm)) return (lmset[k].name); else return NULL;}/* * dict base wid; check if present in LM. * return TRUE if present, FALSE otherwise. */int32dictwd_in_lm(wid)int32 wid;{ return (lmp->dictwid_map[wid] >= 0);}/* * Load pre-compiled trigram LM file, if it exists, into model. If * file does not exist, or is not a dump file, return 0. Otherwise, * if successful, return 1. */static int32lm3g_load(char const *file, char const *lmname, lm_t **out_model, char const *lmfile){ int32 i, j, k, vn, ts, err; int32 n_unigram; int32 n_bigram; int32 n_trigram; int32 dict_size; FILE *fp; char str[1024]; unigram_t *ugptr; bigram_t *bgptr; trigram_t *tgptr; char *tmp_word_str; int do_mmap, do_swap, fd = -1; char *map_base = NULL; size_t offset = 0, filesize; lm_t *model; err = 0; E_INFO("Trying to read %s as precompiled dump file\n", file); if ((fp = fopen(file, "rb")) == NULL) { E_INFO("Precompiled file not found\n"); return -1; } do_swap = 0; fread(&k, sizeof(k), 1, fp); if (k != strlen(darpa_hdr)+1) { SWAP_INT32(&k); if (k != strlen(darpa_hdr)+1) { E_ERROR("Wrong magic header size number %x: %s is not a dump file\n", k); fclose(fp); return -1; } do_swap = 1; } if (fread(str, sizeof(char), k, fp) != (size_t) k) E_FATAL("Cannot read header\n"); if (strncmp(str, darpa_hdr, k) != 0) { E_ERROR("Wrong header %s: %s is not a dump file\n", darpa_hdr); fclose(fp); return -1; } E_INFO("%s\n", str); do_mmap = cmd_ln_boolean("-mmap"); if (do_mmap) { if (do_swap) { E_INFO ("Byteswapping required, will not use memory-mapped I/O for LM file\n"); do_mmap = 0; } else { E_INFO("Will use memory-mapped I/O for LM file\n"); fd = fileno(fp); } } fread(&k, sizeof(k), 1, fp); if (do_swap) SWAP_INT32(&k); if (fread(str, sizeof(char), k, fp) != (size_t) k) E_FATAL("Cannot read LM filename in header\n"); /* read version#, if present (must be <= 0) */ fread(&vn, sizeof(vn), 1, fp); if (do_swap) SWAP_INT32(&vn); if (vn <= 0) { /* read and don't compare timestamps (we don't care) */ fread(&ts, sizeof(ts), 1, fp); if (do_swap) SWAP_INT32(&ts); /* read and skip format description */ for (;;) { fread(&k, sizeof(k), 1, fp); if (do_swap) SWAP_INT32(&k); if (k == 0) break; if (fread(str, sizeof(char), k, fp) != (size_t) k) E_FATAL("fread(word) failed\n"); } /* read model->ucount */ fread(&n_unigram, sizeof(n_unigram), 1, fp); if (do_swap) SWAP_INT32(&n_unigram); } else { n_unigram = vn; } /* read model->bcount, tcount */ fread(&n_bigram, sizeof(n_bigram), 1, fp); if (do_swap) SWAP_INT32(&n_bigram); fread(&n_trigram, sizeof(n_trigram), 1, fp); if (do_swap) SWAP_INT32(&n_trigram); E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram); /* Determine dictionary size (for dict-wid -> LM-wid map) */ dict_size = get_dict_size(&n_unigram, lmname); /* Allocate space for word strings. */ word_str = ckd_calloc(n_unigram, sizeof(char *)); /* Allocate space for LM, including initial OOVs and placeholders; initialize it */ model = NewModel(n_unigram, n_bigram, n_trigram, dict_size); /* read unigrams (always in memory, as they contain dictionary * mappings that can't be precomputed, and also could have OOVs added) */ if (fread(model->unigrams, sizeof(unigram_t), model->ucount + 1, fp) != (size_t) model->ucount + 1) E_FATAL("fread(unigrams) failed\n"); if (do_swap) { for (i = 0, ugptr = model->unigrams; i <= model->ucount; i++, ugptr++) { SWAP_INT32(&ugptr->mapid); SWAP_INT32(&ugptr->prob1.l); SWAP_INT32(&ugptr->bo_wt1.l); SWAP_INT32(&ugptr->bigrams); } } for (i = 0, ugptr = model->unigrams; i < model->ucount; i++, ugptr++) { if (ugptr->mapid != i) err = 1; ugptr->mapid = i; } if (err) E_WARN("Corrected corrupted dump file created by buggy fbs8\n"); E_INFO("%8d = LM.unigrams(+trailer) read\n", model->ucount); /* Now mmap() the file and read in the rest of the (read-only) stuff. */ if (do_mmap) { offset = ftell(fp); fseek(fp, 0, SEEK_END); filesize = ftell(fp); fseek(fp, offset, SEEK_SET); map_base = s2_mmap(file); } /* read bigrams */ if (do_mmap) { model->bigrams = (bigram_t *) (map_base + offset); offset += (model->bcount + 1) * sizeof(bigram_t); } else { model->bigrams = ckd_calloc(model->bcount + 1, sizeof(bigram_t)); if (fread(model->bigrams, sizeof(bigram_t), model->bcount + 1, fp) != (size_t) model->bcount + 1) E_FATAL("fread(bigrams) failed\n"); if (do_swap) { for (i = 0, bgptr = model->bigrams; i <= model->bcount; i++, bgptr++) { SWAP_INT16(&bgptr->wid); SWAP_INT16(&bgptr->prob2); SWAP_INT16(&bgptr->bo_wt2); SWAP_INT16(&bgptr->trigrams); } } } E_INFO("%8d = LM.bigrams(+trailer) read\n", model->bcount); /* read trigrams */ if (model->tcount > 0) { if (do_mmap) { model->trigrams = (trigram_t *) (map_base + offset); offset += model->tcount * sizeof(trigram_t); } else { model->trigrams = ckd_calloc(model->tcount, sizeof(trigram_t)); if (fread (model->trigrams, sizeof(trigram_t), model->tcount, fp) != (size_t) model->tcount) E_FATAL("fread(trigrams) failed\n"); if (do_swap) { for (i = 0, tgptr = model->trigrams; i < model->tcount; i++, tgptr++) { SWAP_INT16(&tgptr->wid); SWAP_INT16(&tgptr->prob3); } } } E_INFO("%8d = LM.trigrams read\n", model->tcount); } /* read n_prob2 and prob2 array (in memory, should be pre-scaled on disk) */ if (do_mmap) fseek(fp, offset, SEEK_SET); fread(&k, sizeof(k), 1, fp); if (do_swap) SWAP_INT32(&k); model->n_prob2 = k; model->prob2 = ckd_calloc(k, sizeof(log_t)); if (fread(model->prob2, sizeof(log_t), k, fp) != (size_t) k) E_FATAL("fread(prob2) failed\n"); if (do_swap) for (i = 0; i < k; i++) SWAP_INT32(&model->prob2[i].l); E_INFO("%8d = LM.prob2 entries read\n", k); /* read n_bo_wt2 and bo_wt2 array (in memory) */ if (model->tcount > 0) { fread(&k, sizeof(k), 1, fp); if (do_swap) SWAP_INT32(&k); model->n_bo_wt2 = k; model->bo_wt2 = ckd_calloc(k, sizeof(log_t)); if (fread(model->bo_wt2, sizeof(log_t), k, fp) != (size_t) k) E_FATAL("fread(bo_wt2) failed\n"); if (do_swap) for (i = 0; i < k; i++) SWAP_INT32(&model->bo_wt2[i].l); E_INFO("%8d = LM.bo_wt2 entries read\n", k); } /* read n_prob3 and prob3 array (in memory) */ if (model->tcount > 0) { fread(&k, sizeof(k), 1, fp); if (do_swap) SWAP_INT32(&k); model->n_prob3 = k; model->prob3 = ckd_calloc(k, sizeof(log_t)); if (fread(model->prob3, sizeof(log_t), k, fp) != (size_t) k) E_FATAL("fread(prob3) failed\n"); if (do_swap) for (i = 0; i < k; i++) SWAP_INT32(&model->prob3[i].l); E_INFO("%8d = LM.prob3 entries read\n", k); } /* read tseg_base size and tseg_base */ /* FIXME: There could be alignment issues here. */ if (do_mmap) offset = ftell(fp); if (model->tcount > 0) { if (do_mmap) { k = *(int32 *) (map_base + offset); offset += sizeof(int32); model->tseg_base = (int32 *) (map_base + offset); offset += k * sizeof(int32); } else { k = (model->bcount + 1) / BG_SEG_SZ + 1; fread(&k, sizeof(k), 1, fp); if (do_swap) SWAP_INT32(&k); model->tseg_base = ckd_calloc(k, sizeof(int32)); if (fread(model->tseg_base, sizeof(int32), k, fp) != (size_t) k) E_FATAL("fread(tseg_base) failed\n"); if (do_swap) for (i = 0; i < k; i++) SWAP_INT32(&model->tseg_base[i]); } E_INFO("%8d = LM.tseg_base entries read\n", k); } /* read ascii word strings */ if (do_mmap) { k = *(int32 *) (map_base + offset); offset += sizeof(int32); tmp_word_str = (char *) (map_base + offset); offset += k; } else { fread(&k, sizeof(k), 1, fp); if (do_swap) SWAP_INT32(&k); tmp_word_str = ckd_calloc(k, sizeof(char)); if (fread(tmp_word_str, sizeof(char), k, fp) != (size_t) k) E_FATAL("fread(word-string) failed\n"); } /* First make sure string just read contains ucount words (PARANOIA!!) */ for (i = 0, j = 0; i < k; i++) if (tmp_word_str[i] == '\0') j++; if (j != model->ucount) E_FATAL("Error reading word strings\n"); /* Break up string just read into words */ if (do_mmap) { j = 0; for (i = 0; i < model->ucount; i++) { word_str[i] = tmp_word_str + j; j += strlen(word_str[i]) + 1; } } else { j = 0; for (i = 0; i < model->ucount; i++) { word_str[i] = ckd_salloc(tmp_word_str + j); j += strlen(word_str[i]) + 1; } free(tmp_word_str); } E_INFO("%8d = ascii word strings read\n", i); *out_model = model; return 0;}static char const *fmtdesc[] = { "BEGIN FILE FORMAT DESCRIPTION", "Header string length (int32) and string (including trailing 0)", "Original LM filename string-length (int32) and filename (including trailing 0)", "(int32) version number (present iff value <= 0)", "(int32) original LM file modification timestamp (iff version# present)", "(int32) string-length and string (including trailing 0) (iff version# present)", "... previous entry continued any number of times (iff version# present)", "(int32) 0 (terminating sequence of strings) (iff version# present)", "(int32) lm_t.ucount (must be > 0)", "(int32) lm_t.bcount", "(int32) lm_t.tcount", "lm_t.ucount+1 unigrams (including sentinel)", "lm_t.bcount+1 bigrams (including sentinel)", "lm_t.tcount trigrams (present iff lm_t.tcount > 0)", "(int32) lm_t.n_prob2", "(int32) lm_t.prob2[]", "(int32) lm_t.n_bo_wt2 (present iff lm_t.tcount > 0)", "(int32) lm_t.bo_wt2[] (present iff lm_t.tcount > 0)", "(int32) lm_t.n_prob3 (present iff lm_t.tcount > 0)", "(int32) lm_t.prob3[] (present iff lm_t.tcount > 0)", "(int32) (lm_t.bcount+1)/BG_SEG_SZ+1 (present iff lm_t.tcount > 0)", "(int32) lm_t.tseg_base[] (present iff lm_t.tcount > 0)", "(int32) Sum(all word string-lengths, including trailing 0 for each)", "All word strings (including trailing 0 for each)", "END FILE FORMAT DESCRIPTION", NULL,};/* * Dump internal LM to file. Format described above * We don't swap bytes because it could be mmap()ed */static int32lm3g_dump(char const *file, lm_t * model, char const *lmfile){ int32 i, k, zero = 0;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -