📄 lm.c
字号:
/* Locate a specific bigram within a bigram list */static int32 find_bg (bg_t *bg, int32 n, s3lmwid_t w){ int32 i, b, e; /* Binary search until segment size < threshold */ b = 0; e = n; while (e-b > BINARY_SEARCH_THRESH) { i = (b+e)>>1; if (bg[i].wid < w) b = i+1; else if (bg[i].wid > w) e = i; else return i; } /* Linear search within narrowed segment */ for (i = b; (i < e) && (bg[i].wid != w); i++); return ((i < e) ? i : -1);}int32 lm_bglist (lm_t *lm, s3lmwid_t w1, bg_t **bgptr, int32 *bowt){ int32 n; if (NOT_S3LMWID(w1) || (w1 >= lm->n_ug)) E_FATAL("Bad w1 argument (%d) to lm_bglist\n", w1); n = (lm->n_bg > 0) ? lm->ug[w1+1].firstbg - lm->ug[w1].firstbg : 0; if (n > 0) { if (! lm->membg[w1].bg) load_bg (lm, w1); lm->membg[w1].used = 1; *bgptr = lm->membg[w1].bg; *bowt = lm->ug[w1].bowt.l; } else { *bgptr = NULL; *bowt = 0; } return (n);}#if 0 /*Obsolete, not used */int32 lm_bg_wordprob (lm_t *lm, s3lmwid_t lwid, int32 th, wordprob_t *wp, int32 *bowt){ bg_t *bgptr; int32 i, j, n, ugprob, bgprob; s3wid_t w; n = lm_bglist (lm, lwid, &bgptr, bowt); ugprob = lm_ug_score (lm, lwid); /* Convert bglist to wordprob */ for (i = 0, j = 0; i < n; i++, bgptr++) { w = lm->ug[bgptr->wid].dictwid; if (IS_S3WID (w)) { bgprob = LM_BGPROB(lm, bgptr); if (ugprob + bgprob >= th) { /* ABSOLUTE prob (count) >= min thresh */ wp[j].wid = w; wp[j].prob = bgprob; j++; } } } return j;}#endif/* * This function look-ups the bigram score of p(lw2|lw1) * The information for lw2 and w2 are repeated because the legacy * implementation(since s3.2) of vithist used only LM wid rather * than dictionary wid. */int32 lm_bg_score (lm_t *lm, s3lmwid_t lw1, s3lmwid_t lw2, s3wid_t w2){ int32 i, n, score; bg_t *bg=0; if ((lm->n_bg == 0) || (NOT_S3LMWID(lw1))) return (lm_ug_score (lm, lw2, w2)); lm->n_bg_score++; if (NOT_S3LMWID(lw2) || (lw2 >= lm->n_ug)) E_FATAL("Bad lw2 argument (%d) to lm_bg_score\n", lw2); n = lm->ug[lw1+1].firstbg - lm->ug[lw1].firstbg; if (n > 0) { if (! lm->membg[lw1].bg) load_bg (lm, lw1); lm->membg[lw1].used = 1; bg = lm->membg[lw1].bg; i = find_bg (bg, n, lw2); } else i = -1; if (i >= 0) { score = lm->bgprob[bg[i].probid].l; if(lm->inclass_ugscore){ /*Only add within class prob if class information exists. Is actually ok to just add the score because if the word is not within-class. The returning scores will be 0. I just love to safe-guard it :-). */ score += lm->inclass_ugscore[w2]; } lm->access_type = 2; } else { lm->n_bg_bo++; lm->access_type = 1; score = lm->ug[lw1].bowt.l + lm->ug[lw2].prob.l; }#if 0 printf (" %5d %5d -> %8d\n", lw1, lw2, score);#endif return (score);}static void load_tg (lm_t *lm, s3lmwid_t lw1, s3lmwid_t lw2){ int32 i, n, b; int32 t = -1; /* Let's make sure that if t isn't initialized after the * "if" statement below, it makes things go bad */ int32 isLM_IN_MEMORY=0; bg_t *bg; tg_t *tg; tginfo_t *tginfo; if (cmd_ln_int32 ("-lminmemory")) isLM_IN_MEMORY = 1; else isLM_IN_MEMORY = 0; /* First allocate space for tg information for bg lw1,lw2 */ tginfo = (tginfo_t *) ckd_malloc (sizeof(tginfo_t)); tginfo->w1 = lw1; tginfo->tg = NULL; tginfo->next = lm->tginfo[lw2]; lm->tginfo[lw2] = tginfo; /* Locate bigram lw1,lw2 */ b = lm->ug[lw1].firstbg; n = lm->ug[lw1+1].firstbg - b; /* Make sure bigrams for lw1, if any, loaded into memory */ if (n > 0) { if (! lm->membg[lw1].bg) load_bg (lm, lw1); lm->membg[lw1].used = 1; bg = lm->membg[lw1].bg; } /* At this point, n = #bigrams for lw1 */ if ((n > 0) && ((i = find_bg (bg, n, lw2)) >= 0)) { tginfo->bowt = lm->tgbowt[bg[i].bowtid].l; /* Find t = Absolute first trigram index for bigram lw1,lw2 */ b += i; /* b = Absolute index of bigram lw1,lw2 on disk */ t = lm->tg_segbase[b >> lm->log_bg_seg_sz]; t += bg[i].firsttg; /* Find #tg for bigram w1,w2 */ n = lm->tg_segbase[(b+1) >> lm->log_bg_seg_sz]; n += bg[i+1].firsttg; n -= t; tginfo->n_tg = n; } else { /* No bigram w1,w2 */ tginfo->bowt = 0; n = tginfo->n_tg = 0; } /* "t" has not been assigned any meanigful value, so if you use it * beyond this point, make sure it's been properly assigned. */ // assert (t != -1); /* At this point, n = #trigrams for lw1,lw2. Read them in */ if (isLM_IN_MEMORY) { /* RAH, already have this in memory */ if (n > 0){ assert(t != -1); tg = tginfo->tg = &lm->tg[t]; } } else { if (n > 0) { tg = tginfo->tg = (tg_t *) ckd_calloc (n, sizeof(tg_t)); if (fseek (lm->fp, lm->tgoff + t*sizeof(tg_t), SEEK_SET) < 0) E_FATAL_SYSTEM("fseek failed\n"); if (fread (tg, sizeof(tg_t), n, lm->fp) != (size_t)n) E_FATAL("fread(tg, %d at %d) failed\n", n, lm->tgoff); if (lm->byteswap) { for (i = 0; i < n; i++) { SWAP_INT16(&(tg[i].wid)); SWAP_INT16(&(tg[i].probid)); } } } } lm->n_tg_fill++; lm->n_tg_inmem += n;}/* Similar to find_bg */static int32 find_tg (tg_t *tg, int32 n, s3lmwid_t w){ int32 i, b, e; b = 0; e = n; while (e-b > BINARY_SEARCH_THRESH) { i = (b+e)>>1; if (tg[i].wid < w) b = i+1; else if (tg[i].wid > w) e = i; else return i; } for (i = b; (i < e) && (tg[i].wid != w); i++); return ((i < e) ? i : -1);}int32 lm_tglist (lm_t *lm, s3lmwid_t lw1, s3lmwid_t lw2, tg_t **tgptr, int32 *bowt){ tginfo_t *tginfo, *prev_tginfo; if (lm->n_tg <= 0) { *tgptr = NULL; *bowt = 0; return 0; } if (NOT_S3LMWID(lw1) || (lw1 >= lm->n_ug)) E_FATAL("Bad lw1 argument (%d) to lm_tglist\n", lw1); if (NOT_S3LMWID(lw2) || (lw2 >= lm->n_ug)) E_FATAL("Bad lw2 argument (%d) to lm_tglist\n", lw2); prev_tginfo = NULL; for (tginfo = lm->tginfo[lw2]; tginfo; tginfo = tginfo->next) { if (tginfo->w1 == lw1) break; prev_tginfo = tginfo; } if (! tginfo) { load_tg (lm, lw1, lw2); tginfo = lm->tginfo[lw2]; } else if (prev_tginfo) { prev_tginfo->next = tginfo->next; tginfo->next = lm->tginfo[lw2]; lm->tginfo[lw2] = tginfo; } tginfo->used = 1; *tgptr = tginfo->tg; *bowt = tginfo->bowt; return (tginfo->n_tg);}/* * This function look-ups the trigram score of p(lw3|lw2,lw1) * and compute the in-class ug probability of w3. * The information for lw3 and w3 are repeated because the legacy * implementation(since s3.2) of vithist used only LM wid rather * than dictionary wid. * */int32 lm_tg_score (lm_t *lm, s3lmwid_t lw1, s3lmwid_t lw2, s3lmwid_t lw3, s3wid_t w3){ int32 i, h, n, score; tg_t *tg; tginfo_t *tginfo, *prev_tginfo; if ((lm->n_tg == 0) || (NOT_S3LMWID(lw1))) return (lm_bg_score (lm, lw2, lw3, w3)); lm->n_tg_score++; if (NOT_S3LMWID(lw1) || (lw1 >= lm->n_ug)) E_FATAL("Bad lw1 argument (%d) to lm_tg_score\n", lw1); if (NOT_S3LMWID(lw2) || (lw2 >= lm->n_ug)) E_FATAL("Bad lw2 argument (%d) to lm_tg_score\n", lw2); if (NOT_S3LMWID(lw3) || (lw3 >= lm->n_ug)) E_FATAL("Bad lw3 argument (%d) to lm_tg_score\n", lw3); /* Lookup tgcache first; compute hash(lw1, lw2, lw3) */ h = ((lw1 & 0x000003ff) << 21) + ((lw2 & 0x000003ff) << 11) + (lw3 & 0x000007ff); h %= LM_TGCACHE_SIZE; if ((lm->tgcache[h].lwid[0] == lw1) && (lm->tgcache[h].lwid[1] == lw2) && (lm->tgcache[h].lwid[2] == lw3)) { lm->n_tgcache_hit++; return lm->tgcache[h].lscr; } prev_tginfo = NULL; for (tginfo = lm->tginfo[lw2]; tginfo; tginfo = tginfo->next) { if (tginfo->w1 == lw1) break; prev_tginfo = tginfo; } if (! tginfo) { load_tg (lm, lw1, lw2); tginfo = lm->tginfo[lw2]; } else if (prev_tginfo) { prev_tginfo->next = tginfo->next; tginfo->next = lm->tginfo[lw2]; lm->tginfo[lw2] = tginfo; } tginfo->used = 1; /* Trigrams for w1,w2 now in memory; look for w1,w2,w3 */ n = tginfo->n_tg; tg = tginfo->tg; if ((i = find_tg (tg, n, lw3)) >= 0) { score = lm->tgprob[tg[i].probid].l ; if(lm->inclass_ugscore){ /*Only add within class prob if class information exists. Is actually ok to just add the score because if the word is not within-class. The returning scores will be 0. */ score += lm->inclass_ugscore[w3]; } lm->access_type = 3; } else { lm->n_tg_bo++; score = tginfo->bowt + lm_bg_score(lm, lw2, lw3, w3); }#if 0 printf ("%5d %5d %5d -> %8d\n", lw1, lw2, lw3, score);#endif lm->tgcache[h].lwid[0] = lw1; lm->tgcache[h].lwid[1] = lw2; lm->tgcache[h].lwid[2] = lw3; lm->tgcache[h].lscr = score; return (score);}s3lmwid_t lm_wid (lm_t *lm, char *word){ int32 i; for (i = 0; i < lm->n_ug; i++) if (strcmp (lm->wordstr[i], word) == 0) return ((s3lmwid_t) i); return BAD_S3LMWID;}void lm_free (lm_t *lm){ int i; for (i=0;i<lm->n_ug;i++) ckd_free ((void *) lm->wordstr[i]); /* */ ckd_free ((void *) lm->membg); ckd_free ((void *) lm->wordstr); ckd_free ((void *) lm->tgcache); ckd_free ((void *) lm->tg_segbase); ckd_free ((void *) lm->tgprob); ckd_free ((void *) lm->tgbowt); ckd_free ((void *) lm->bgprob); ckd_free ((void *) lm->tginfo); ckd_free ((void *) lm->ug); ckd_free ((void *) lm); }int32 lm_rawscore (lm_t *lm, int32 score, float64 lwf){ if (lwf != 1.0) score /= (int32)lwf; score -= lm->wip; score /= (int32)lm->lw; return score;}#if (_LM_TEST_)static int32 sentence_lmscore (lm_t *lm, char *line){ char *word[1024]; s3lmwid_t w[1024]; int32 nwd, score, tgscr; int32 i, j; if ((nwd = str2words (line, word, 1020)) < 0) E_FATAL("Increase word[] and w[] arrays size\n"); w[0] = BAD_S3LMWID; w[1] = lm_wid (lm, S3_START_WORD); if (NOT_S3LMWID(w[1])) E_FATAL("Unknown word: %s\n", S3_START_WORD); for (i = 0; i < nwd; i++) { w[i+2] = lm_wid (lm, word[i]); if (NOT_S3LMWID(w[i+2])) { E_ERROR("Unknown word: %s\n", word[i]); return 0; } } w[i+2] = lm_wid (lm, S3_FINISH_WORD); if (NOT_S3LMWID(w[i+2])) E_FATAL("Unknown word: %s\n", S3_FINISH_WORD); score = 0; for (i = 0, j = 2; i <= nwd; i++, j++) { tgscr = lm_tg_score (lm, w[j-2], w[j-1], w[j]); score += tgscr; printf ("\t%10d %s\n", tgscr, lm->wordstr[w[j]]); } return (score);}main (int32 argc, char *argv[]){ char line[4096]; int32 score, k; lm_t *lm; if (argc < 2) E_FATAL("Usage: %s <LMdumpfile>\n", argv[0]); logs3_init (1.0001); lm = lm_read (argv[1], 9.5, 0.2); if (1) { /* Short cut this so we can test for memory leaks */ for (;;) { printf ("> "); if (fgets (line, sizeof(line), stdin) == NULL) break; score = sentence_lmscore (lm, line); k = strlen(line); if (line[k-1] == '\n') line[k-1] = '\0'; printf ("LMScr(%s) = %d\n", line, score); } } /* */ lm_free(lm); exit (0);}#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -