📄 ngram_read_arpa.c
字号:
/**
 * @file   ngram_read_arpa.c
 * @author Akinobu LEE
 * @date   Wed Feb 16 16:52:24 2005
 *
 * <JA>
 * @brief  Read N-gram files in ARPA format
 *
 * (English rendering of the Japanese description, which was mis-encoded.)
 * When ARPA format N-gram files are used, the 2-gram and the reverse
 * 3-gram are each read from a separate file.
 * </JA>
 *
 * <EN>
 * @brief  Read ARPA format N-gram files
 *
 * When N-gram data is given in ARPA format, both 2-gram file and
 * reverse 3-gram file should be specified.
 * </EN>
 *
 * @sa ngram2.h
 *
 * $Revision: 1.6 $
 *
 */
/*
 * Copyright (c) 1991-2006 Kawahara Lab., Kyoto University
 * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
 * Copyright (c) 2005-2006 Julius project team, Nagoya Institute of Technology
 * All rights reserved
 */

/* $Id: ngram_read_arpa.c,v 1.6 2006/11/10 02:27:43 sumomo Exp $ */

/* words should be alphabetically sorted */

#include <sent/stddefs.h>
#include <sent/ngram2.h>

static char buf[800];		///< Local buffer for reading
static char pbuf[800];		///< Local buffer for error string

/**
 * Look up the N-gram word/class ID of an entry name.  An unknown name is
 * reported through j_error().
 *
 * @param ndata [in] N-gram data
 * @param str [in] name string of N-gram entry
 *
 * @return the entry ID.
 */
static WORD_ID
lookup_word(NGRAM_INFO *ndata, char *str)
{
  WORD_ID id = ngram_lookup_word(ndata, str);

  if (id == WORD_INVALID) {
    j_error("word %s not in N-gram vocabulary.\n",str);
  }
  return id;
}

/**
 * @brief  Set unknown word ID to the N-gram data.
 *
 * In CMU-Cam SLM toolkit, OOV words are always mapped to UNK, which
 * always appears at the very beginning of N-gram entry, so we fix the
 * unknown word ID at "0".
 *
 * @param ndata [out] N-gram data to set unknown word ID.
*/voidset_unknown_id(NGRAM_INFO *ndata){#if 0 ndata->unk_id = ngram_lookup_word(ndata, unkword); if (ndata->unk_id == WORD_INVALID) { j_printerr("word %s not found, so assume this is a closed vocabulary model\n", unkword); ndata->isopen = FALSE; } else { ndata->isopen = TRUE; }#endif ndata->isopen = TRUE; ndata->unk_id = 0; /* unknown (OOV) words are always mapped to the number 0 (by CMU-TK)*/}/** * Set number of N-gram entries, for reading the first LR 2-gram. * * @param fp [in] file pointer * @param ndata [out] N-gram data to set it. */static voidset_total_info(FILE *fp, NGRAM_INFO *ndata){ char *p; int n; while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') { if (strnmatch(buf, "ngram", 5)) { /* n-gram num */ p = strtok(buf, "="); n = p[strlen(p)-1] - '0' - 1; p = strtok(NULL, "="); ndata->ngram_num[n] = atoi(p); } }}/* read total info and check it with LR data (RL) *//** * Read number of N-gram entries of the second RL 3-gram, and * check if those values are exactly the same as the previous LR values. * * @param fp [in] file pointer * @param ndata [i/o] N-gram data */static voidset_and_check_total_info(FILE *fp, NGRAM_INFO *ndata){ char *p; int n; while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') { if (strnmatch(buf, "ngram", 5)) { /* n-gram num */ p = strtok(buf, "="); n = p[strlen(p)-1] - '0' - 1; p = strtok(NULL, "=");/* * if (n <= 2 && ndata->ngram_num[n] != atoi(p)) { * j_printerr("LR and RL don't match at ngram_num!\n"); * j_error("cut-off value when building LM differ?\n"); * } */ if (n == 2) { /* 3-gram */ ndata->ngram_num[n] = atoi(p); } else { if (n <= 1 && ndata->ngram_num[n] != atoi(p)) { j_printerr("Warning: %d-gram total num differ! may cause read error\n",n+1); } } } }}/** * Read word/class entry names and 1-gram data from LR 2-gram file. * * @param fp [in] file pointer * @param ndata [out] N-gram to set the read data. 
*/
static void
set_unigram(FILE *fp, NGRAM_INFO *ndata)
{
  /* Notes on input checking:
     - every table below holds ngram_num[0] entries, so the entry count is
       checked against that value BEFORE anything is stored;
     - a line lacking the probability or the name is reported as an error;
     - a line lacking the back-off weight gets 0.0, the value the ARPA
       format implies for an omitted weight.
     NOTE(review): the NULL checks assume first_token()/next_token() return
     NULL when no token is left -- confirm against their definition. */
  WORD_ID read_word_num = 0;	/* # of words already read */
  WORD_ID nid, resid;
  LOGPROB prob = 0.0, bo_wt;
  char *name, *tok;

  /* malloc area */
  ndata->wname = (char **)mymalloc(sizeof(char *)*ndata->ngram_num[0]);
  ndata->p = (LOGPROB *)mymalloc(sizeof(LOGPROB)*ndata->ngram_num[0]);
  ndata->bo_wt_lr = (LOGPROB *)mymalloc(sizeof(LOGPROB)*ndata->ngram_num[0]);
  ndata->bo_wt_rl = (LOGPROB *)mymalloc(sizeof(LOGPROB)*ndata->ngram_num[0]);
  ndata->n2_bgn = (NNID *)mymalloc(sizeof(NNID)*ndata->ngram_num[0]);
  ndata->n2_num = (WORD_ID *)mymalloc(sizeof(WORD_ID)*ndata->ngram_num[0]);

  while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
    /* keep a copy of the line for error messages, as set_bigram() does */
    strcpy(pbuf, buf);

    /* refuse to write past the tables allocated above */
    if (read_word_num >= ndata->ngram_num[0]) {
      j_printerr("Error: actual n-gram word num exceeded header value\n");
      j_error("%d > %d\n", read_word_num + 1, ndata->ngram_num[0]);
    }

    /* probability, then entry name */
    tok = first_token(buf);
    if (tok != NULL) {
      prob = (LOGPROB)atof(tok);
      tok = next_token();
    }
    if (tok == NULL) {
      j_printerr("Error: malformed 1-gram entry\n");
      j_error("at 1-gram #%d: \"%s\"\n", read_word_num + 1, pbuf);
    }
    name = strcpy((char *)mymalloc(strlen(tok)+1), tok);

    /* optional back-off weight */
    tok = next_token();
    bo_wt = (tok != NULL) ? (LOGPROB)atof(tok) : (LOGPROB)0.0;

    /* register unigram */
    nid = read_word_num;
    ndata->wname[nid] = name;
    /* add entry name to index tree */
    if (ndata->root == NULL) {
      ndata->root = ptree_make_root_node(nid);
    } else {
      resid = ptree_search_data(name, ndata->root);
      if (strmatch(name, ndata->wname[resid])) { /* already exist */
	j_error("Error: word \"%s\" multiply defined at (#%d and #%d)\n", name, resid, nid);
      } else {
	ptree_add_entry(name, nid, ndata->wname[resid], &(ndata->root));
      }
    }
    ndata->p[nid] = prob;
    ndata->bo_wt_lr[nid] = bo_wt;
    /* no 2-gram attached yet; set_bigram() fills these in */
    ndata->n2_bgn[nid] = NNID_INVALID;
    ndata->n2_num[nid] = 0;

    read_word_num++;
    if (read_word_num > ndata->max_word_num) {
      j_printerr("Error: actual n-gram word num exceeded header value\n");
      j_error("%d > %d\n", read_word_num, ndata->max_word_num);
    }
  }

  if (read_word_num != ndata->ngram_num[0]) {
    j_printerr("Error: actual n-gram word num not match the header value\n");
    j_error("%d != %d ?\n", read_word_num, ndata->ngram_num[0]);
  }
  j_printerr("  1-gram read %d end\n", read_word_num);
}

/* read-in 1-gram (RL) --- only add back-off weight */
/**
 * Read 1-gram data from RL 3-gram file.  Only the back-off weights are
 * stored.
* * @param fp [in] file pointer * @param ndata [out] N-gram to store the read data. */static voidadd_unigram(FILE *fp, NGRAM_INFO *ndata){ WORD_ID read_word_num; WORD_ID nid; LOGPROB prob, bo_wt; char *name, *p; read_word_num = 0; while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') { prob = atof(first_token(buf)); p = next_token(); name = strcpy((char *)mymalloc(strlen(p)+1), p); bo_wt = (LOGPROB)atof(next_token()); /* add bo_wt_rl to existing 1-gram entry */ nid = lookup_word(ndata, name); if (nid == WORD_INVALID) { j_printerr("Warning: n-gram word \"%s\" in RL not exist in LR (ignored)\n", name); } else { ndata->bo_wt_rl[nid] = bo_wt; } read_word_num++; if (read_word_num > ndata->max_word_num) { j_printerr("Error: actual n-gram word num exceeded header value\n"); j_error("%d > %d\n", read_word_num, ndata->max_word_num); } free(name); } j_printerr(" 1-gram read %d end\n", read_word_num); }/** * Read 2-gram data from LR 2-gram file. * * @param fp [in] file pointer * @param ndata [out] N-gram to set the read data. 
*/
static void
set_bigram(FILE *fp, NGRAM_INFO *ndata)
{
  /* Notes on input checking:
     - the 2-gram tables hold ngram_num[1] entries, so the entry count is
       checked BEFORE an entry is stored (this also guarantees a non-zero
       divisor for the progress percentage);
     - entries sharing a left context must be contiguous, and their right
       words must come in strictly increasing ID order;
     - a line lacking a field is reported as an error.
     NOTE(review): the NULL checks assume first_token()/next_token() return
     NULL when no token is left -- confirm against their definition. */
  int w_l, w_r;
  int w_last, w_r_last;
  LOGPROB p = 0.0;
  NNID n2;
  char *tok;

  ndata->n2tonid = (WORD_ID *)mymalloc(sizeof(WORD_ID)*ndata->ngram_num[1]);
  ndata->p_lr = (LOGPROB *)mymalloc(sizeof(LOGPROB)*ndata->ngram_num[1]);
  ndata->p_rl = (LOGPROB *)mymalloc(sizeof(LOGPROB)*ndata->ngram_num[1]);
  ndata->bo_wt_rrl = (LOGPROB *)mymalloc(sizeof(LOGPROB)*ndata->ngram_num[1]);

  n2 = 0;

  /* read in LR 2-gram */
  w_last = -1; w_r_last = -1;
  for (;;) {
    if (getl(buf, sizeof(buf), fp) == NULL || buf[0] == '\\') break;
    strcpy(pbuf, buf);		/* untouched copy for error messages */

    /* check total num before storing anything for this entry */
    if (n2 >= ndata->ngram_num[1]) {
      j_printerr("Error: actual 2-gram num not match the header value\n");
      j_error("%d != %d ?\n", n2 + 1, ndata->ngram_num[1]);
    }

    if ( n2 % 100000 == 0) {
      /* computed in double so that n2 * 100 cannot overflow */
      j_printerr("  2-gram read %d (%d%%)\n", n2,
		 (int)((double)n2 * 100.0 / (double)ndata->ngram_num[1]));
    }

    /* 2-gram probability, then the left (context) word */
    tok = first_token(buf);
    if (tok != NULL) {
      p = (LOGPROB)atof(tok);
      tok = next_token();
    }
    if (tok == NULL) {
      j_printerr("Error: malformed 2-gram entry\n");
      j_error("at 2-gram #%d: \"%s\"\n", n2+1, pbuf);
    }
    /* read in left (context) word and lookup the ID */
    w_l = lookup_word(ndata, tok);

    /* increment n2_bgn and n2_num if context word changed */
    if (w_l != w_last) {
      if (w_last != -1) ndata->n2_num[w_last] = n2 - ndata->n2_bgn[w_last];
      /* the next context word should be an new entry */
      if (ndata->n2_bgn[w_l] != NNID_INVALID) {
	j_printerr("Error: entry not sorted (same left context not sequenced)\n");
	j_error("at 2-gram #%d: \"%s\"\n", n2+1, pbuf);
      }
      ndata->n2_bgn[w_l] = n2;
      w_r_last = -1;
    }

    /* read in right word and set */
    tok = next_token();
    if (tok == NULL) {
      j_printerr("Error: malformed 2-gram entry\n");
      j_error("at 2-gram #%d: \"%s\"\n", n2+1, pbuf);
    }
    w_r = lookup_word(ndata, tok);
    if (w_r == w_r_last) {
      j_printerr("Error: duplicated entry\n");
      j_error("at 2-gram #%d: \"%s\"\n", n2+1, pbuf);
    } else if (w_r < w_r_last) {
      j_printerr("Error: entry not sorted downward\n");
      j_error("at 2-gram #%d: \"%s\"\n", n2+1, pbuf);
    }
    ndata->n2tonid[n2] = w_r;
    ndata->p_lr[n2] = p;

    n2++;
    w_last = w_l;
    w_r_last = w_r;
  }

  /* set the last entry (nothing to close when no 2-gram was read) */
  if (w_last != -1) {
    ndata->n2_num[w_last] = n2 - ndata->n2_bgn[w_last];
  }

  j_printerr("  2-gram read %d end\n", n2);
}

/**
 * Read reverse 2-gram data from RL 3-gram file, and set RL 2-gram
probabilities and back-off values for RL 3-gram to the corresponding * LR 2-gram data. * * @param fp [in] file pointer * @param ndata [i/o] N-gram to set the read data. */static voidadd_bigram_rl(FILE *fp, NGRAM_INFO *ndata){ WORD_ID w_l, w_r; LOGPROB prob, bo_wt; int bi_count = 0; NNID n2; while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') { /* p(w_l|w_r) w_r w_l bo_wt_rl */ if ( ++bi_count % 100000 == 0) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -