📄 lm.c
字号:
lm->byteswap = 1; else { SWAP_INT32(&k); E_FATAL("Bad magic number: %d(%08x), not an LM dumpfile??\n", k, k); } } /* Read and verify standard header string */ if (fread (str, sizeof (char), k, lm->fp) != (size_t)k) E_FATAL("fread(%s) failed\n", file); if (strncmp (str, darpa_hdr, k) != 0) E_FATAL("Bad header\n"); /* Original LM filename string size and string */ k = lm_fread_int32 (lm); if ((k < 1) || (k > 1024)) E_FATAL("Bad original filename size: %d\n", k); if (fread (str, sizeof (char), k, lm->fp) != (size_t)k) E_FATAL("fread(%s) failed\n", file); /* Version#. If present (must be <= 0); otherwise it's actually the unigram count */ vn = lm_fread_int32 (lm); if (vn <= 0) { /* Read and skip orginal file timestamp; (later compare timestamps) */ k = lm_fread_int32 (lm); /* Read and skip format description */ for (;;) { if ((k = lm_fread_int32 (lm)) == 0) break; if (fread (str, sizeof(char), k, lm->fp) != (size_t)k) E_FATAL("fread(%s) failed\n", file); } /* Read log_bg_seg_sz if present */ if (vn <= -2) { k = lm_fread_int32 (lm); if ((k < 1) || (k > 15)) E_FATAL("log2(bg_seg_sz) outside range 1..15\n", k); lm->log_bg_seg_sz = k; } else lm->log_bg_seg_sz = LOG2_BG_SEG_SZ; /* Default */ /* Read #ug */ lm->n_ug = lm_fread_int32 (lm); } else { /* No version number, actually a unigram count */ lm->n_ug = vn; lm->log_bg_seg_sz = LOG2_BG_SEG_SZ; /* Default */ } if ((lm->n_ug <= 0) || (lm->n_ug >= MAX_S3LMWID)) E_FATAL("Bad #ug: %d (must be >0, <%d\n", lm->n_ug, MAX_S3LMWID); lm->bg_seg_sz = 1 << lm->log_bg_seg_sz; /* #bigrams */ lm->n_bg = lm_fread_int32 (lm); if (lm->n_bg < 0) E_FATAL("Bad #bigrams: %d\n", lm->n_bg); /* #trigrams */ lm->n_tg = lm_fread_int32 (lm); if (lm->n_tg < 0) E_FATAL("Bad #trigrams: %d\n", lm->n_tg); /* Read ug; remember sentinel ug at the end! */ lm->ug = (ug_t *) ckd_calloc (lm->n_ug+1, sizeof(ug_t)); if (fread (lm->ug, sizeof(ug_t), lm->n_ug+1, lm->fp) != (size_t)(lm->n_ug+1)) E_FATAL("fread(%s) failed\n", file); if (lm->byteswap) for (i = 0; i <= lm->n_ug; i++) { SWAP_INT32(&(lm->ug[i].prob.l)); SWAP_INT32(&(lm->ug[i].bowt.l)); SWAP_INT32(&(lm->ug[i].firstbg)); } E_INFO("%8d ug\n", lm->n_ug); /* RAH, 5.1.01 - Let's try reading the whole damn thing in here */ if (isLM_IN_MEMORY) { lm->bg = (bg_t *) ckd_calloc (lm->n_bg+1,sizeof(bg_t)); lm->tg = (tg_t *) ckd_calloc (lm->n_tg+1,sizeof(tg_t)); if (lm->n_bg > 0) { /* Read bigrams; remember sentinel at the end */ lm->bgoff = ftell (lm->fp); fread (lm->bg, lm->n_bg+1,sizeof(bg_t),lm->fp); E_INFO("Read %8d bigrams [in memory]\n", lm->n_bg); lm->membg = (membg_t *) ckd_calloc (lm->n_ug, sizeof(membg_t)); } if (lm->n_tg > 0) { /* Read trigrams */ lm->tgoff = ftell (lm->fp); fread (lm->tg,lm->n_tg,sizeof(tg_t),lm->fp); E_INFO("Read %8d trigrams [in memory]\n", lm->n_tg); lm->tginfo = (tginfo_t **) ckd_calloc (lm->n_ug, sizeof(tginfo_t *)); } } else { lm->bg = NULL; lm->tg = NULL; /* Skip bigrams; remember sentinel at the end */ if (lm->n_bg > 0) { lm->bgoff = ftell (lm->fp); fseek (lm->fp, (lm->n_bg+1) * sizeof(bg_t), SEEK_CUR); E_INFO("%8d bigrams [on disk]\n", lm->n_bg); lm->membg = (membg_t *) ckd_calloc (lm->n_ug, sizeof(membg_t)); } /* Skip trigrams */ if (lm->n_tg > 0) { lm->tgoff = ftell (lm->fp); fseek (lm->fp, lm->n_tg * sizeof(tg_t), SEEK_CUR); E_INFO("%8d trigrams [on disk]\n", lm->n_tg); lm->tginfo = (tginfo_t **) ckd_calloc (lm->n_ug, sizeof(tginfo_t *)); } } if (lm->n_bg > 0) { /* Bigram probs table size */ lm->n_bgprob = lm_fread_int32 (lm); if ((lm->n_bgprob <= 0) || (lm->n_bgprob > 65536)) E_FATAL("Bad bigram prob table size: %d\n", lm->n_bgprob); /* Allocate and read bigram probs table */ lm->bgprob = (lmlog_t *) ckd_calloc (lm->n_bgprob, sizeof (lmlog_t)); if (fread(lm->bgprob, sizeof(lmlog_t), lm->n_bgprob, lm->fp) != (size_t)lm->n_bgprob) E_FATAL("fread(%s) failed\n", file); if (lm->byteswap) { for (i = 0; i < lm->n_bgprob; i++) SWAP_INT32(&(lm->bgprob[i].l)); } E_INFO("%8d bigram prob entries\n", lm->n_bgprob); } if (lm->n_tg > 0) { /* Trigram bowt table size */ lm->n_tgbowt = lm_fread_int32 (lm); if ((lm->n_tgbowt <= 0) || (lm->n_tgbowt > 65536)) E_FATAL("Bad trigram bowt table size: %d\n", lm->n_tgbowt); /* Allocate and read trigram bowt table */ lm->tgbowt = (lmlog_t *) ckd_calloc (lm->n_tgbowt, sizeof (lmlog_t)); if (fread (lm->tgbowt, sizeof (lmlog_t), lm->n_tgbowt, lm->fp) != (size_t)lm->n_tgbowt) E_FATAL("fread(%s) failed\n", file); if (lm->byteswap) { for (i = 0; i < lm->n_tgbowt; i++) SWAP_INT32(&(lm->tgbowt[i].l)); } E_INFO("%8d trigram bowt entries\n", lm->n_tgbowt); /* Trigram prob table size */ lm->n_tgprob = lm_fread_int32 (lm); if ((lm->n_tgprob <= 0) || (lm->n_tgprob > 65536)) E_FATAL("Bad trigram bowt table size: %d\n", lm->n_tgprob); /* Allocate and read trigram bowt table */ lm->tgprob = (lmlog_t *) ckd_calloc (lm->n_tgprob, sizeof (lmlog_t)); if (fread (lm->tgprob, sizeof (lmlog_t), lm->n_tgprob, lm->fp) != (size_t)lm->n_tgprob) E_FATAL("fread(%s) failed\n", file); if (lm->byteswap) { for (i = 0; i < lm->n_tgprob; i++) SWAP_INT32(&(lm->tgprob[i].l)); } E_INFO("%8d trigram prob entries\n", lm->n_tgprob); /* Trigram seg table size */ k = lm_fread_int32 (lm); if (k != (lm->n_bg+1)/lm->bg_seg_sz+1) E_FATAL("Bad trigram seg table size: %d\n", k); /* Allocate and read trigram seg table */ lm->tg_segbase = (int32 *) ckd_calloc (k, sizeof(int32)); if (fread (lm->tg_segbase, sizeof(int32), k, lm->fp) != (size_t)k) E_FATAL("fread(%s) failed\n", file); if (lm->byteswap) { for (i = 0; i < k; i++) SWAP_INT32(&(lm->tg_segbase[i])); } E_INFO("%8d trigram segtable entries (%d segsize)\n", k, lm->bg_seg_sz); } /* Read word string names */ k = lm_fread_int32 (lm); if (k <= 0) E_FATAL("Bad wordstrings size: %d\n", k); tmp_word_str = (char *) ckd_calloc (k, sizeof (char)); if (fread (tmp_word_str, sizeof(char), k, lm->fp) != (size_t)k) E_FATAL("fread(%s) failed\n", file); /* First make sure string just read contains n_ug words (PARANOIA!!) */ for (i = 0, j = 0; i < k; i++) if (tmp_word_str[i] == '\0') j++; if (j != lm->n_ug) E_FATAL("Bad #words: %d\n", j); /* Break up string just read into words */ startwid = endwid = BAD_S3LMWID; lm->wordstr = (char **) ckd_calloc (lm->n_ug, sizeof(char *)); j = 0; for (i = 0; i < lm->n_ug; i++) { if (strcmp (tmp_word_str+j, S3_START_WORD) == 0) startwid = i; else if (strcmp (tmp_word_str+j, S3_FINISH_WORD) == 0) endwid = i; lm->wordstr[i] = (char *) ckd_salloc (tmp_word_str+j); j += strlen(tmp_word_str+j) + 1; } free (tmp_word_str); E_INFO("%8d word strings\n", i); /* Force ugprob(<s>) = MIN_PROB_F */ if (IS_S3LMWID(startwid)) { lm->ug[startwid].prob.f = MIN_PROB_F; lm->startlwid = startwid; } /* Force bowt(</s>) = MIN_PROB_F */ if (IS_S3LMWID(endwid)) { lm->ug[endwid].bowt.f = MIN_PROB_F; lm->finishlwid = endwid; } if(n_lmclass_used>0) { lm_build_lmclass_info(lm,lw,uw,wip,n_lmclass_used,lmclass); } lm2logs3 (lm, uw); /* Applying unigram weight; convert to logs3 values */ /* Apply the new lw and wip values */ lm->lw = 1.0; /* The initial settings for lw and wip */ lm->wip = 0; /* logs3(1.0) */ lm_set_param (lm, lw, wip); return lm;}lm_t *lm_read (char *file, float64 lw, float64 wip, float64 uw){ int32 i, u; lm_t *lm; int32 isLM_IN_MEMORY=0; if (! file) E_FATAL("No LM file\n"); if (lw <= 0.0) E_FATAL("lw = %e\n", lw); if (wip <= 0.0) E_FATAL("wip = %e\n", wip); if ((uw < 0.0) || (uw > 1.0)) E_FATAL("uw = %e\n", uw); E_INFO ("LM read('%s', lw= %.2f, wip= %d, uw= %.2f)\n", file, lw, logs3(wip), uw); if (cmd_ln_int32 ("-lminmemory")) isLM_IN_MEMORY = 1; else isLM_IN_MEMORY = 0; /* For now, only dump files can be read; they are created offline */ lm = lm_read_dump (file, lw, wip, uw,0,NULL,0); for (u = 0; u < lm->n_ug; u++) lm->ug[u].dictwid = BAD_S3WID; /* Initialize the fast trigram cache, with all entries invalid */ lm->tgcache = (lm_tgcache_entry_t *) ckd_calloc(LM_TGCACHE_SIZE, sizeof(lm_tgcache_entry_t)); for (i = 0; i < LM_TGCACHE_SIZE; i++) lm->tgcache[i].lwid[0] = BAD_S3LMWID; return lm;}/* * Free stale bigram and trigram info, those not used since last reset. */void lm_cache_reset (lm_t *lm){ int32 i, n_bgfree, n_tgfree; tginfo_t *tginfo, *next_tginfo, *prev_tginfo; int32 isLM_IN_MEMORY=0; n_bgfree = n_tgfree = 0; if (cmd_ln_int32 ("-lminmemory")) isLM_IN_MEMORY = 1; else isLM_IN_MEMORY = 0; /* ARCHAN: RAH only short-circult this function only */ if (isLM_IN_MEMORY) /* RAH We are going to short circuit this if we are running with the lm in memory */ return; if ((lm->n_bg > 0) && (! lm->bg)) { /* Disk-based; free "stale" bigrams */ for (i = 0; i < lm->n_ug; i++) { if (lm->membg[i].bg && (! lm->membg[i].used)) { lm->n_bg_inmem -= lm->ug[i+1].firstbg - lm->ug[i].firstbg; free (lm->membg[i].bg); lm->membg[i].bg = NULL; n_bgfree++; } lm->membg[i].used = 0; } } if (lm->n_tg > 0) { for (i = 0; i < lm->n_ug; i++) { prev_tginfo = NULL; for (tginfo = lm->tginfo[i]; tginfo; tginfo = next_tginfo) { next_tginfo = tginfo->next; if (! tginfo->used) { if ((! lm->tg) && tginfo->tg) { lm->n_tg_inmem -= tginfo->n_tg; free (tginfo->tg); n_tgfree++; } free (tginfo); if (prev_tginfo) prev_tginfo->next = next_tginfo; else lm->tginfo[i] = next_tginfo; } else { tginfo->used = 0; prev_tginfo = tginfo; } } } } if ((n_tgfree > 0) || (n_bgfree > 0)) { E_INFO("%d tg frees, %d in mem; %d bg frees, %d in mem\n", n_tgfree, lm->n_tg_inmem, n_bgfree, lm->n_bg_inmem); }}void lm_cache_stats_dump (lm_t *lm){ E_INFO("%9d tg(), %9d tgcache, %8d bo; %5d fills, %8d in mem (%.1f%%)\n", lm->n_tg_score, lm->n_tgcache_hit, lm->n_tg_bo, lm->n_tg_fill, lm->n_tg_inmem, (lm->n_tg_inmem*100.0)/(lm->n_tg+1)); E_INFO("%8d bg(), %8d bo; %5d fills, %8d in mem (%.1f%%)\n", lm->n_bg_score, lm->n_bg_bo, lm->n_bg_fill, lm->n_bg_inmem, (lm->n_bg_inmem*100.0)/(lm->n_bg+1)); lm->n_tgcache_hit = 0; lm->n_tg_fill = 0; lm->n_tg_score = 0; lm->n_tg_bo = 0; lm->n_bg_fill = 0; lm->n_bg_score = 0; lm->n_bg_bo = 0;}int32 lm_ug_score (lm_t *lm, s3lmwid_t lwid, s3wid_t wid){ if (NOT_S3LMWID(lwid) || (lwid >= lm->n_ug)) E_FATAL("Bad argument (%d) to lm_ug_score\n", lwid); lm->access_type = 1; if(lm->inclass_ugscore) return (lm->ug[lwid].prob.l +lm->inclass_ugscore[wid]); else return (lm->ug[lwid].prob.l );}int32 lm_uglist (lm_t *lm, ug_t **ugptr){ *ugptr = lm->ug; return (lm->n_ug);}/* This create a mapping from either the unigram or words in a class*/int32 lm_ug_wordprob (lm_t *lm, dict_t *dict,int32 th, wordprob_t *wp){ int32 i, j, n, p; s3wid_t w,dictid; lmclass_t lmclass; lmclass_word_t lm_cw; n = lm->n_ug; for (i = 0, j = 0; i < n; i++) { w = lm->ug[i].dictwid; if (IS_S3WID(w)) { /*Is w>0? Then it can be either wid or class id*/ if (w < LM_CLASSID_BASE){ /*It is just a word*/ if ((p = lm->ug[i].prob.l) >= th) { wp[j].wid = w; wp[j].prob = p; j++; } }else{ /* It is a class */ lmclass=LM_CLASSID_TO_CLASS(lm,w); /* Get the class*/ lm_cw=lmclass_firstword(lmclass); while(lmclass_isword(lm_cw)){ dictid =lmclass_getwid(lm_cw); /*E_INFO("Lookup dict_id using dict_basewid %d\n",dictid);*/ if(IS_S3WID(dictid)){ if(dictid !=dict_basewid(dict,dictid)){ dictid=dict_basewid(dict,dictid); } if((p=lm->ug[i].prob.l+lm->inclass_ugscore[dictid])>=th){ wp[j].wid=dictid; wp[j].prob=lm->ug[i].prob.l; j++; } }else{ E_INFO("Word %s cannot be found \n", lmclass_getword(lm_cw)); } lm_cw= lmclass_nextword (lmclass,lm_cw); } } } } return j;}/* * Load bigrams for the given unigram (LMWID) lw1 from disk into memory */static void load_bg (lm_t *lm, s3lmwid_t lw1){ int32 i, n, b; bg_t *bg; int32 isLM_IN_MEMORY=0; b = lm->ug[lw1].firstbg; /* Absolute first bg index for ug lw1 */ n = lm->ug[lw1+1].firstbg - b; /* Not including guard/sentinel */ if (cmd_ln_int32 ("-lminmemory")) isLM_IN_MEMORY = 1; else isLM_IN_MEMORY = 0; if (isLM_IN_MEMORY) /* RAH, if LM_IN_MEMORY, then we don't need to go get it. */ bg = lm->membg[lw1].bg = &lm->bg[b]; else { bg = lm->membg[lw1].bg = (bg_t *) ckd_calloc (n+1, sizeof(bg_t)); if (fseek (lm->fp, lm->bgoff + b*sizeof(bg_t), SEEK_SET) < 0) E_FATAL_SYSTEM ("fseek failed\n"); /* Need to read n+1 because obtaining tg count for one bg also depends on next bg */ if (fread (bg, sizeof(bg_t), n+1, lm->fp) != (size_t)(n+1)) E_FATAL("fread failed\n"); if (lm->byteswap) { for (i = 0; i <= n; i++) { SWAP_INT16(&(bg[i].wid)); SWAP_INT16(&(bg[i].probid)); SWAP_INT16(&(bg[i].bowtid)); SWAP_INT16(&(bg[i].firsttg)); } } } lm->n_bg_fill++; lm->n_bg_inmem += n;}#define BINARY_SEARCH_THRESH 16
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -