⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 ngram_read_arpa.c

📁 Julius version 4.12: speech recognition software.
💻 C
📖 第 1 页 / 共 2 页
字号:
/**
 * @file   ngram_read_arpa.c
 *
 * <EN>
 * @brief  Read ARPA format N-gram files
 *
 * When N-gram data is given in ARPA format, both 2-gram file and
 * reverse 3-gram file should be specified.  The two are read from
 * separate files.
 * </EN>
 *
 * @sa ngram2.h
 *
 * @author Akinobu LEE
 * @date   Wed Feb 16 16:52:24 2005
 *
 * $Revision: 1.15 $
 *
 */
/*
 * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University
 * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
 * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology
 * All rights reserved
 */

/* $Id: ngram_read_arpa.c,v 1.15 2009/02/10 08:15:48 sumomo Exp $ */

/* words should be alphabetically sorted */

#include <sent/stddefs.h>
#include <sent/ngram2.h>

static char buf[800];			///< Local buffer for reading
static char pbuf[800];			///< Local buffer for error string

/**
 * Read the "ngram N=count" header lines and store the number of entries
 * for each N.
 *
 * Lines are consumed until one starting with a backslash (or end of
 * input) is reached.  Only a single-digit N is recognized: N is taken
 * from the last character before the '='.  An N-gram whose count is 0 is
 * skipped with a warning and does not contribute to the return value.
 *
 * @param fp [in] file pointer
 * @param num [out] num[N-1] receives the number of N-gram entries; the
 * function may write any index in 0 .. MAX_N-1.
 *
 * @return the largest N that has a non-zero count (0 if there is none),
 * or -1 on error.
 */
static int
get_total_info(FILE *fp, NNID num[])
{
  char *p;
  int n;
  int maxn;
  unsigned long entry_num;

  maxn = 0;

  while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
    if (strnmatch(buf, "ngram", 5)) { /* n-gram num */
      p = strtok(buf, "=");
      if (p == NULL || *p == '\0') {
	jlog("Error: ngram_read_arpa: failed to parse n-gram header, corrupted or invalid data?\n");
	return -1;
      }
      n = p[strlen(p)-1] - '0';
      /* reject anything that is not a digit 1..9: num[n-1] below must
	 never be indexed with a negative value */
      if (n < 1) {
	jlog("Error: ngram_read_arpa: invalid N in n-gram header \"%s\"\n", p);
	return -1;
      }
      if (n > MAX_N) {
	jlog("Error: too long N-gram (N=%d)\n", n);
	jlog("Error: current maximum length of N-gram is set to %d\n", MAX_N);
	jlog("Error: you can expand the limit by setting MAX_N in \"sent/ngram.h\"\n");
	return -1;
      }
      p = strtok(NULL, "=");
      /* a missing or non-numeric count would otherwise leave entry_num
	 uninitialized (or pass NULL to sscanf) */
      if (p == NULL || sscanf(p, "%lu", &entry_num) != 1) {
	jlog("Error: ngram_read_arpa: failed to parse number of %d-gram entries, corrupted or invalid data?\n", n);
	return -1;
      }
      /* check maximum number */
      if (entry_num > NNID_MAX) {
	jlog("Error: too big %d-gram (exceeds %d bit)\n", n, (int)(sizeof(NNID) * 8));
	return -1;
      }
      /* ignore empty entry */
      if (entry_num == 0) {
	jlog("Warning: empty %d-gram, skipped\n", n);
      } else {
	num[n-1] = (NNID)entry_num;
	if (maxn < n) maxn = n;
      }
    }
  }

  return(maxn);
}

/**
 * Read word/class entry names and 1-gram data from LR 2-gram file.
 *
 * Each line is parsed as "prob name [bo_wt]"; a missing back-off weight
 * is stored as 0.0.  Every name is registered in ndata->wname and in the
 * index tree at ndata->root.  A duplicated name is reported, skipped, and
 * makes the function return FALSE once reading has finished.
 *
 * @param fp [in] file pointer
 * @param ndata [out] N-gram to set the read data.
 *
 * @return TRUE on success, FALSE on a parse error, a duplicate entry, or
 * when the number of entries read disagrees with the header value.
 */
static boolean
set_unigram(FILE *fp, NGRAM_INFO *ndata)
{
  WORD_ID nid;
  int resid;
  LOGPROB prob, bo_wt;
  char *name, *p;
  boolean ok_p = TRUE;
  NGRAM_TUPLE_INFO *t;

  t = &(ndata->d[0]);

  /* malloc name area */
  ndata->wname = (char **)mymalloc(sizeof(char *) * ndata->max_word_num);

  /* malloc data area */
  t->bgn_upper = NULL;
  t->bgn_lower = NULL;
  t->bgn = NULL;
  t->num = NULL;
  t->bgnlistlen = 0;
  t->nnid2wid = NULL;
  t->prob = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->totalnum);
  t->bo_wt = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->totalnum);
  t->context_num = t->totalnum;
  t->nnid2ctid_upper = NULL;
  t->nnid2ctid_lower = NULL;

  nid = 0;

  while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
    if ((p = strtok(buf, DELM)) == NULL) {
      jlog("Error: ngram_read_arpa: 1-gram: failed to parse, corrupted or invalid data?\n");
      return FALSE;
    }
    prob = (LOGPROB)atof(p);
    if ((p = strtok(NULL, DELM)) == NULL) {
      jlog("Error: ngram_read_arpa: 1-gram: failed to parse, corrupted or invalid data?\n");
      return FALSE;
    }
    /* check the header limit before anything is stored at index nid, so
       that a file with too many 1-grams cannot write past wname[] */
    if (nid >= ndata->max_word_num) {
      jlog("Error: ngram_read_arpa: num of 1-gram is bigger than header value (%d)\n", ndata->max_word_num);
      return FALSE;
    }
    name = strcpy((char *)mymalloc(strlen(p)+1), p);
    if ((p = strtok(NULL, DELM)) == NULL) {
      bo_wt = 0.0;
    } else {
      bo_wt = (LOGPROB)atof(p);
    }

    /* add entry name to index tree, then register word entry name */
    if (ndata->root == NULL) {
      ndata->wname[nid] = name;
      ndata->root = ptree_make_root_node(nid);
    } else {
      resid = ptree_search_data(name, ndata->root);
      if (resid != -1 && strmatch(name, ndata->wname[resid])) { /* already exist */
	jlog("Error: ngram_read_arpa: duplicate word entry \"%s\" at #%d and #%d in 1-gram\n", name, resid, nid);
	/* the duplicate is not registered anywhere, so release its copy */
	free(name);
	ok_p = FALSE;
	continue;
      }
      ndata->wname[nid] = name;
      ptree_add_entry(name, nid, ndata->wname[resid], &(ndata->root));
    }

    /* register entry info */
    t->prob[nid] = prob;
    t->bo_wt[nid] = bo_wt;

    nid++;
  }

  if (nid != t->totalnum) {
    jlog("Error: ngram_read_arpa: num of 1-gram (%d) not equal to header value (%d)\n", nid, t->totalnum);
    return FALSE;
  }

  if (ok_p == TRUE) {
    jlog("Stat: ngram_read_arpa: read %d 1-gram entries\n", nid);
  }

  return ok_p;
}

/* read-in 1-gram (RL) --- only add back-off weight */
/**
 * Read 1-gram data from RL 3-gram file.  Only the back-off weights are
 * stored.
 *
 * Each line is parsed as "prob name [bo_wt]".  The probability field is
 * ignored, and a missing back-off weight is stored as 0.0.  The name is
 * looked up with ngram_lookup_word(); a name that is not found is
 * reported and makes the function return FALSE once reading has finished.
 *
 * NOTE(review): elements of ndata->bo_wt_1 for words that never appear in
 * this file are not written here; whether mymalloc_big() zero-fills its
 * result is not visible from this file -- confirm.
 *
 * @param fp [in] file pointer
 * @param ndata [out] N-gram to store the read data.
 *
 * @return TRUE on success, FALSE on a parse error, a vocabulary mismatch,
 * or when more entries are read than ndata->max_word_num.
 */
static boolean
add_unigram(FILE *fp, NGRAM_INFO *ndata)
{
  WORD_ID read_word_num;
  WORD_ID nid;
  LOGPROB bo_wt;
  char *name, *p;
  boolean ok_p = TRUE;
  boolean mismatched = FALSE;

  ndata->bo_wt_1 = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), ndata->max_word_num);

  read_word_num = 0;
  while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
    /* first field is the probability; it must be present but is not used */
    if (strtok(buf, DELM) == NULL) {
      jlog("Error: ngram_read_arpa: RL 1-gram: failed to parse, corrupted or invalid data?\n");
      return FALSE;
    }
    /* the name token stays valid inside buf until the next getl(), so no
       heap copy is needed for the lookup below */
    if ((name = strtok(NULL, DELM)) == NULL) {
      jlog("Error: ngram_read_arpa: RL 1-gram: failed to parse, corrupted or invalid data?\n");
      return FALSE;
    }
    if ((p = strtok(NULL, DELM)) == NULL) {
      bo_wt = 0.0;
    } else {
      bo_wt = (LOGPROB)atof(p);
    }

    /* add bo_wt_rl to existing 1-gram entry */
    nid = ngram_lookup_word(ndata, name);
    if (nid == WORD_INVALID) {
      if (mismatched == FALSE) {
	jlog("Error: ngram_read_arpa: vocabulary mismatch between LR n-gram and RL n-gram\n");
	mismatched = TRUE;
      }
      jlog("Error: ngram_read_arpa: \"%s\" does not appears in LR n-gram\n", name);
      ok_p = FALSE;
    } else {
      ndata->bo_wt_1[nid] = bo_wt;
    }

    read_word_num++;
    if (read_word_num > ndata->max_word_num) {
      jlog("Error: ngram_read_arpa: vocabulary size of RL n-gram is bigger than header value (%d)\n", ndata->max_word_num);
      return FALSE;
    }
  }

  if (ok_p == TRUE) {
    jlog("Stat: ngram_read_arpa: read %d 1-gram entries\n", read_word_num);
  }

  return ok_p;
}

/**
 * Read forward 2-gram data and set the LR 2-gram probabilities to the
 * already loaded RL N-gram.
 *
 * Each line is parsed as "prob word1 word2".  When ndata->dir is DIR_RL
 * the two words are swapped before the tuple is searched with
 * search_ngram(); the probability of a found tuple is stored in
 * ndata->p_2.  A tuple that is not found is reported as a warning and
 * ignored.  A word unknown to the 1-gram is reported, the line is
 * skipped, and the function returns FALSE once reading has finished.
 *
 * @param fp [in] file pointer
 * @param ndata [i/o] N-gram to set the read data.
 *
 * @return TRUE on success, FALSE on a parse error or an unknown word.
 */
static boolean
add_bigram(FILE *fp, NGRAM_INFO *ndata)
{
  WORD_ID w[2], wtmp;
  LOGPROB prob;
  NNID bi_count = 0;
  NNID n2;
  boolean ok_p = TRUE;
  char *s;

  ndata->p_2 = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), ndata->d[1].totalnum);

  while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
    /* keep an untouched copy of the line for messages: strtok() below
       overwrites the delimiters in buf */
    strcpy(pbuf, buf);
    if ( ++bi_count % 100000 == 0) {
      unsigned long total = (unsigned long)ndata->d[1].totalnum;
      /* compute the percentage in floating point: bi_count * 100 may not
	 fit in NNID, and a zero total must not be used as a divisor */
      int percent = (total != 0) ? (int)((double)bi_count * 100.0 / (double)total) : 0;
      jlog("Stat: ngram_read_arpa: 2-gram read %lu (%d%%)\n", (unsigned long)bi_count, percent);
    }
    if ((s = strtok(buf, DELM)) == NULL) {
      jlog("Error: ngram_read_arpa: 2-gram: failed to parse, corrupted or invalid data?\n");
      return FALSE;
    }
    prob = (LOGPROB)atof(s);
    if ((s = strtok(NULL, DELM)) == NULL) {
      jlog("Error: ngram_read_arpa: 2-gram: failed to parse, corrupted or invalid data?\n");
      return FALSE;
    }
    w[0] = ngram_lookup_word(ndata, s);
    if (w[0] == WORD_INVALID) {
      jlog("Error: ngram_read_arpa: 2-gram #%lu: \"%s\": \"%s\" not exist in 1-gram\n", (unsigned long)bi_count, pbuf, s);
      ok_p = FALSE;
      continue;
    }
    if ((s = strtok(NULL, DELM)) == NULL) {
      jlog("Error: ngram_read_arpa: 2-gram: failed to parse, corrupted or invalid data?\n");
      return FALSE;
    }
    w[1] = ngram_lookup_word(ndata, s);
    if (w[1] == WORD_INVALID) {
      jlog("Error: ngram_read_arpa: 2-gram #%lu: \"%s\": \"%s\" not exist in 1-gram\n", (unsigned long)bi_count, pbuf, s);
      ok_p = FALSE;
      continue;
    }
    if (ndata->dir == DIR_RL) {
      /* word order should be reversed */
      wtmp = w[0];
      w[0] = w[1];
      w[1] = wtmp;
    }
    n2 = search_ngram(ndata, 2, w);
    if (n2 == NNID_INVALID) {
      /* report the entry number of the line; n2 holds only the "not
	 found" marker on this path */
      jlog("Warning: ngram_read_arpa: 2-gram #%lu: \"%s\": (%s,%s) not exist in LR 2-gram (ignored)\n", (unsigned long)bi_count, pbuf, ndata->wname[w[0]], ndata->wname[w[1]]);
    } else {
      ndata->p_2[n2] = prob;
    }
  }

  if (ok_p == TRUE) {
    jlog("Stat: ngram_read_arpa: 2-gram read %lu end\n", (unsigned long)bi_count);
  }

  return ok_p;
}

/**
 * Read n-gram data for a given N from ARPA n-gram file. (n >= 2)
 *
 * @param fp [in] file pointer
 * @param ndata [out] N-gram to set the read data.
 */
static boolean
set_ngram(FILE *fp, NGRAM_INFO *ndata, int n)
{
  NNID i;
  WORD_ID w[MAX_N];
  WORD_ID w_last[MAX_N];
  LOGPROB p, bowt;
  NNID nnid;
  NNID cid, cid_last;
  boolean ok_p = TRUE;
  char *s;
  NGRAM_TUPLE_INFO *t;
  NGRAM_TUPLE_INFO *tprev;
  NNID ntmp;

  if (n < 2) {
    jlog("Error: ngram_read_arpa: unable to process 1-gram\n");
    return FALSE;
  }

  t = &(ndata->d[n-1]);
  tprev = &(ndata->d[n-2]);

  /* initialize pointer storage to access from (N-1)-gram */
  t->bgnlistlen = tprev->context_num;
  if (t->is24bit) {

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -