📄 init_ngram.c
字号:
/** * @file init_ngram.c * * <JA> * @brief N-gramファイルをメモリに粕み哈み帽胳辑今と滦炳を艰る * </JA> * * <EN> * @brief Load N-gram file into memory and setup with word dictionary * </EN> * * @author Akinobu LEE * @date Wed Feb 16 07:40:53 2005 * * $Revision: 1.6 $ * *//* * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology * All rights reserved */#include <sent/stddefs.h>#include <sent/ngram2.h>#include <sent/vocabulary.h>/** * Read and setup N-gram data from binary format file. * * @param ndata [out] pointer to N-gram data structure to store the data * @param bin_ngram_file [in] file name of the binary N-gram */booleaninit_ngram_bin(NGRAM_INFO *ndata, char *bin_ngram_file){ FILE *fp; jlog("Stat: init_ngram: reading in binary n-gram from %s\n", bin_ngram_file); if ((fp = fopen_readfile(bin_ngram_file)) == NULL) { jlog("Error: init_ngram: failed to open \"%s\"\n", bin_ngram_file); return FALSE; } if (ngram_read_bin(fp, ndata) == FALSE) { jlog("Error: init_ngram: failed to read \"%s\"\n", bin_ngram_file); return FALSE; } if (fclose_readfile(fp) == -1) { jlog("Error: init_ngram: failed to close \"%s\"\n", bin_ngram_file); return FALSE; } jlog("Stat: init_ngram: finished reading n-gram\n"); return TRUE;}/** * Read and setup N-gram data from ARPA format file. * * @param ndata [out] pointer to N-gram data structure to store the data * @param ngram_file [in] file name of ARPA (reverse) 3-gram file * @param dir [in] direction (DIR_LR | DIR_RL) */booleaninit_ngram_arpa(NGRAM_INFO *ndata, char *ngram_file, int dir){ FILE *fp; ndata->root = NULL; ndata->dir = dir; jlog("Stat: init_ngram: reading in ARPA %s n-gram from %s\n", (ndata->dir == DIR_LR) ? "forward" : "backward", ngram_file); /* read RL n-gram */ if ((fp = fopen_readfile(ngram_file)) == NULL) { jlog("Error: init_ngram: failed to open \"%s\"\n", ngram_file); return FALSE; } if (ngram_read_arpa(fp, ndata, FALSE) == FALSE) { jlog("Error: init_ngram: failed to read \"%s\"\n", ngram_file); return FALSE; } if (fclose_readfile(fp) == -1) { jlog("Error: init_ngram: failed to close \"%s\"\n", ngram_file); return FALSE; } jlog("Stat: init_ngram: finished reading n-gram\n"); return TRUE;}/** * Read additional LR 2-gram for 1st pass. * * @param ndata [out] pointer to N-gram data structure to store the data * @param bigram_file [in] file name of ARPA 2-gram file */booleaninit_ngram_arpa_additional(NGRAM_INFO *ndata, char *bigram_file){ FILE *fp; jlog("Stat: init_ngram: reading in additional LR 2-gram for the 1st pass from %s\n", bigram_file); if ((fp = fopen_readfile(bigram_file)) == NULL) { jlog("Error: init_ngram: failed to open \"%s\"\n", bigram_file); return FALSE; } if (ngram_read_arpa(fp, ndata, TRUE) == FALSE) { jlog("Error: init_ngram: failed to read \"%s\"\n", bigram_file); return FALSE; } if (fclose_readfile(fp) == -1) { jlog("Error: init_ngram: failed to close \"%s\"\n", bigram_file); return FALSE; } jlog("Stat: init_ngram: finished reading LR 2-gram\n"); return TRUE;}/** * Make correspondence between word dictionary and N-gram vocabulary. * * @param ndata [i/o] word/class N-gram, the unknown word information will be set. * @param winfo [i/o] word dictionary, the word-to-ngram-entry mapping will be done here. */booleanmake_voca_ref(NGRAM_INFO *ndata, WORD_INFO *winfo){ int i; boolean ok_flag = TRUE; int count = 0; jlog("Stat: init_ngram: mapping dictonary words to n-gram entries\n"); ndata->unk_num = 0; for (i = 0; i < winfo->num; i++) { winfo->wton[i] = make_ngram_ref(ndata, winfo->wname[i]); if (winfo->wton[i] == WORD_INVALID) { ok_flag = FALSE; count++; continue; } if (winfo->wton[i] == ndata->unk_id) { (ndata->unk_num)++; } } if (ok_flag == FALSE) { jlog("Error: --- Failed to map %d words in dictionary to N-gram\n", count); jlog("Error: --- Specify the word to which those words are mapped with \"-mapunk\" (default: \"<unk>\" or \"<UNK>\"\n"); return FALSE; } if (ndata->unk_num == 0) { ndata->unk_num_log = 0.0; /* for safe */ } else { ndata->unk_num_log = (float)log10(ndata->unk_num); } jlog("Stat: init_ngram: finished word-to-ngram mapping\n"); return TRUE;}/** * @brief Set unknown word ID to the N-gram data. * * * @param ndata [out] N-gram data to set unknown word ID. * @param str [in] word name string of unknown word */voidset_unknown_id(NGRAM_INFO *ndata, char *str){ ndata->unk_id = ngram_lookup_word(ndata, str); if (ndata->unk_id == WORD_INVALID) { if (strmatch(str, UNK_WORD_DEFAULT)) { /* if default "<unk>" is not found, also try "<UNK>" */ ndata->unk_id = ngram_lookup_word(ndata, UNK_WORD_DEFAULT2); if (ndata->unk_id == WORD_INVALID) { jlog("Stat: init_ngram: either \"%s\" and \"%s\" not found, assuming close vocabulary LM\n", UNK_WORD_DEFAULT, UNK_WORD_DEFAULT2); ndata->isopen = FALSE; return; } } } if (ndata->unk_id == WORD_INVALID) { jlog("Stat: init_ngram: \"%s\" not found, assuming close vocabulary LM\n", str); ndata->isopen = FALSE; } else { jlog("Stat: init_ngram: unknown words will be mapped to \"%s\"\n", str); ndata->isopen = TRUE; }}/** * @brief Fix unigram probability of BOS / EOS word. * * This function checks the probabilities of BOS / EOS word, and * if it is set to "-99", give the same as another one. * This is the case when the LM is trained by SRILM, which assigns * unigram probability of "-99" to the beginning-of-sentence word, * and causes search on reverse direction to fail. * * @param ndata [i/o] N-gram data * @param winfo [i/o] Vocabulary information * */voidfix_uniprob_srilm(NGRAM_INFO *ndata, WORD_INFO *winfo){ WORD_ID wb, we; wb = winfo->wton[winfo->head_silwid]; we = winfo->wton[winfo->tail_silwid]; if (ndata->d[0].prob[wb] == -99.0) { jlog("Warning: BOS word \"%s\" has unigram prob of \"-99\"\n", ndata->wname[wb]); jlog("Warning: assigining value of EOS word \"%s\": %f\n", ndata->wname[we], ndata->d[0].prob[we]); ndata->d[0].prob[wb] = ndata->d[0].prob[we]; } else if (ndata->d[0].prob[we] == -99.0) { jlog("Warning: EOS word \"%s\" has unigram prob of \"-99\"\n", ndata->wname[we]); jlog("Warning: assigining value of BOS word \"%s\": %f\n", ndata->wname[wb], ndata->d[0].prob[wb]); ndata->d[0].prob[we] = ndata->d[0].prob[wb]; }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -