📄 ngram_read_arpa.c
字号:
/** * @file ngram_read_arpa.c * * <JA> * @brief ARPA形式のN-gramファイルを読み込む * * ARPA形式のN-gramファイルを用いる場合,2-gram と逆向き 3-gram を * それぞれ別々のファイルから読み込みます. * </JA> * * <EN> * @brief Read ARPA format N-gram files * * When N-gram data is given in ARPA format, both 2-gram file and * reverse 3-gram file should be specified. * </EN> * * @sa ngram2.h * * @author Akinobu LEE * @date Wed Feb 16 16:52:24 2005 * * $Revision: 1.15 $ * *//* * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology * All rights reserved *//* $Id: ngram_read_arpa.c,v 1.15 2009/02/10 08:15:48 sumomo Exp $ *//* words should be alphabetically sorted */#include <sent/stddefs.h>#include <sent/ngram2.h>static char buf[800]; ///< Local buffer for readingstatic char pbuf[800]; ///< Local buffer for error string /** * Set number of N-gram entries, for reading the first LR 2-gram. * * @param fp [in] file pointer * @param num [out] set the values to this buffer * * @return the value of N, or -1 on error. 
*/static intget_total_info(FILE *fp, NNID num[]){ char *p; int n; int maxn; unsigned long entry_num; maxn = 0; while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') { if (strnmatch(buf, "ngram", 5)) { /* n-gram num */ p = strtok(buf, "="); n = p[strlen(p)-1] - '0'; if (n > MAX_N) { jlog("Error: too long N-gram (N=%d)\n", n); jlog("Error: current maximum length of N-gram is set to %d\n", MAX_N); jlog("Error: you can expand the limit by setting MAX_N in \"sent/ngram.h\"\n"); return -1; } p = strtok(NULL, "="); //entry_num = atol(p); sscanf(p, "%lu", &entry_num); /* check maximum number */ if (entry_num > NNID_MAX) { jlog("Error: too big %d-gram (exceeds %d bit)\n", n, sizeof(NNID) * 8); return -1; } /* ignore empty entry */ if (entry_num == 0) { jlog("Warning: empty %d-gram, skipped\n", n); } else { num[n-1] = entry_num; if (maxn < n) maxn = n; } } } return(maxn);}/** * Read word/class entry names and 1-gram data from LR 2-gram file. * * @param fp [in] file pointer * @param ndata [out] N-gram to set the read data. 
*/static booleanset_unigram(FILE *fp, NGRAM_INFO *ndata){ WORD_ID nid; int resid; LOGPROB prob, bo_wt; char *name, *p; boolean ok_p = TRUE; NGRAM_TUPLE_INFO *t; t = &(ndata->d[0]); /* malloc name area */ ndata->wname = (char **)mymalloc(sizeof(char *) * ndata->max_word_num); /* malloc data area */ //t->bgn_upper = t->bgn_lower = t->bgn = t->num = NULL; t->bgn_upper = NULL; t->bgn_lower = NULL; t->bgn = NULL; t->num = NULL; t->bgnlistlen = 0; t->nnid2wid = NULL; t->prob = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->totalnum); t->bo_wt = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->totalnum); t->context_num = t->totalnum; t->nnid2ctid_upper = NULL; t->nnid2ctid_lower = NULL; nid = 0; while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') { if ((p = strtok(buf, DELM)) == NULL) { jlog("Error: ngram_read_arpa: 1-gram: failed to parse, corrupted or invalid data?\n"); return FALSE; } prob = (LOGPROB)atof(p); if ((p = strtok(NULL, DELM)) == NULL) { jlog("Error: ngram_read_arpa: 1-gram: failed to parse, corrupted or invalid data?\n"); return FALSE; } name = strcpy((char *)mymalloc(strlen(p)+1), p); if ((p = strtok(NULL, DELM)) == NULL) { bo_wt = 0.0; } else { bo_wt = (LOGPROB)atof(p); } /* register word entry name */ ndata->wname[nid] = name; /* add entry name to index tree */ if (ndata->root == NULL) { ndata->root = ptree_make_root_node(nid); } else { resid = ptree_search_data(name, ndata->root); if (resid != -1 && strmatch(name, ndata->wname[resid])) { /* already exist */ jlog("Error: ngram_read_arpa: duplicate word entry \"%s\" at #%d and #%d in 1-gram\n", name, resid, nid); ok_p = FALSE; continue; } else { ptree_add_entry(name, nid, ndata->wname[resid], &(ndata->root)); } } if (nid >= ndata->max_word_num) { jlog("Error: ngram_read_arpa: num of 1-gram is bigger than header value (%d)\n", ndata->max_word_num); return FALSE; } /* register entry info */ t->prob[nid] = prob; t->bo_wt[nid] = bo_wt; nid++; } if (nid != t->totalnum) { jlog("Error: ngram_read_arpa: num of 
1-gram (%d) not equal to header value (%d)\n", nid, t->totalnum); return FALSE; } if (ok_p == TRUE) { jlog("Stat: ngram_read_arpa: read %d 1-gram entries\n", nid); } return ok_p;}/* read-in 1-gram (RL) --- only add back-off weight *//** * Read 1-gram data from RL 3-gram file. Only the back-off weights are * stored. * * @param fp [in] file pointer * @param ndata [out] N-gram to store the read data. */static booleanadd_unigram(FILE *fp, NGRAM_INFO *ndata){ WORD_ID read_word_num; WORD_ID nid; LOGPROB prob, bo_wt; char *name, *p; boolean ok_p = TRUE; boolean mismatched = FALSE; ndata->bo_wt_1 = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), ndata->max_word_num); read_word_num = 0; while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') { if ((p = strtok(buf, DELM)) == NULL) { jlog("Error: ngram_read_arpa: RL 1-gram: failed to parse, corrupted or invalid data?\n"); return FALSE; } prob = atof(p); if ((p = strtok(NULL, DELM)) == NULL) { jlog("Error: ngram_read_arpa: RL 1-gram: failed to parse, corrupted or invalid data?\n"); return FALSE; } name = strcpy((char *)mymalloc(strlen(p)+1), p); if ((p = strtok(NULL, DELM)) == NULL) { bo_wt = 0.0; } else { bo_wt = (LOGPROB)atof(p); } /* add bo_wt_rl to existing 1-gram entry */ nid = ngram_lookup_word(ndata, name); if (nid == WORD_INVALID) { if (mismatched == FALSE) { jlog("Error: ngram_read_arpa: vocabulary mismatch between LR n-gram and RL n-gram\n"); mismatched = TRUE; } jlog("Error: ngram_read_arpa: \"%s\" does not appears in LR n-gram\n", name); ok_p = FALSE; } else { ndata->bo_wt_1[nid] = bo_wt; } read_word_num++; if (read_word_num > ndata->max_word_num) { jlog("Error: ngram_read_arpa: vocabulary size of RL n-gram is bigger than header value (%d)\n", ndata->max_word_num); return FALSE; } free(name); } if (ok_p == TRUE) { jlog("Stat: ngram_read_arpa: read %d 1-gram entries\n", read_word_num); } return ok_p;}/** * Read forward 2-gram data and set the LR 2-gram probabilities to the * already loaded RL N-gram. 
* * @param fp [in] file pointer * @param ndata [i/o] N-gram to set the read data. */static booleanadd_bigram(FILE *fp, NGRAM_INFO *ndata){ WORD_ID w[2], wtmp; LOGPROB prob; NNID bi_count = 0; NNID n2; boolean ok_p = TRUE; char *s; ndata->p_2 = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), ndata->d[1].totalnum); while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') { strcpy(pbuf, buf); if ( ++bi_count % 100000 == 0) { jlog("Stat: ngram_read_arpa: 2-gram read %lu (%d%%)\n", bi_count, bi_count * 100 / ndata->d[1].totalnum); } if ((s = strtok(buf, DELM)) == NULL) { jlog("Error: ngram_read_arpa: 2-gram: failed to parse, corrupted or invalid data?\n"); return FALSE; } prob = (LOGPROB)atof(s); if ((s = strtok(NULL, DELM)) == NULL) { jlog("Error: ngram_read_arpa: 2-gram: failed to parse, corrupted or invalid data?\n"); return FALSE; } w[0] = ngram_lookup_word(ndata, s); if (w[0] == WORD_INVALID) { jlog("Error: ngram_read_arpa: 2-gram #%lu: \"%s\": \"%s\" not exist in 1-gram\n", bi_count, pbuf, s); ok_p = FALSE; continue; } if ((s = strtok(NULL, DELM)) == NULL) { jlog("Error: ngram_read_arpa: 2-gram: failed to parse, corrupted or invalid data?\n"); return FALSE; } w[1] = ngram_lookup_word(ndata, s); if (w[1] == WORD_INVALID) { jlog("Error: ngram_read_arpa: 2-gram #%lu: \"%s\": \"%s\" not exist in 1-gram\n", bi_count, pbuf, s); ok_p = FALSE; continue; } if (ndata->dir == DIR_RL) { /* word order should be reversed */ wtmp = w[0]; w[0] = w[1]; w[1] = wtmp; } n2 = search_ngram(ndata, 2, w); if (n2 == NNID_INVALID) { jlog("Warning: ngram_read_arpa: 2-gram #%d: \"%s\": (%s,%s) not exist in LR 2-gram (ignored)\n", n2+1, pbuf, ndata->wname[w[0]], ndata->wname[w[1]]); } else { ndata->p_2[n2] = prob; } } if (ok_p == TRUE) { jlog("Stat: ngram_read_arpa: 2-gram read %lu end\n", bi_count); } return ok_p;} /** * Read n-gram data for a given N from ARPA n-gram file. (n >= 2) * * @param fp [in] file pointer * @param ndata [out] N-gram to set the read data. 
*/static booleanset_ngram(FILE *fp, NGRAM_INFO *ndata, int n){ NNID i; WORD_ID w[MAX_N]; WORD_ID w_last[MAX_N]; LOGPROB p, bowt; NNID nnid; NNID cid, cid_last; boolean ok_p = TRUE; char *s; NGRAM_TUPLE_INFO *t; NGRAM_TUPLE_INFO *tprev; NNID ntmp; if (n < 2) { jlog("Error: ngram_read_arpa: unable to process 1-gram\n"); return FALSE; } t = &(ndata->d[n-1]); tprev = &(ndata->d[n-2]); /* initialize pointer storage to access from (N-1)-gram */ t->bgnlistlen = tprev->context_num; if (t->is24bit) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -