📄 ngram_read_arpa.c

📁 About sound recognition. I want to download.
💻 C
📖 第 1 页 / 共 2 页
字号:
12 下一页
/**
 * @file   ngram_read_arpa.c
 * @author Akinobu LEE
 * @date   Wed Feb 16 16:52:24 2005
 *
 * <EN>
 * @brief  Read ARPA format N-gram files
 *
 * When N-gram data is given in ARPA format, both 2-gram file and
 * reverse 3-gram file should be specified.
 * </EN>
 *
 * @sa ngram2.h
 *
 * $Revision: 1.6 $
 *
 */
/*
 * Copyright (c) 1991-2006 Kawahara Lab., Kyoto University
 * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
 * Copyright (c) 2005-2006 Julius project team, Nagoya Institute of Technology
 * All rights reserved
 */

/* $Id: ngram_read_arpa.c,v 1.6 2006/11/10 02:27:43 sumomo Exp $ */

/* words should be alphabetically sorted */

#include <sent/stddefs.h>
#include <sent/ngram2.h>

static char buf[800];			///< Local buffer for reading
static char pbuf[800];			///< Local buffer for error string

/**
 * Highest N-gram order handled by this reader (1-gram .. 3-gram).
 * NOTE(review): assumes ndata->ngram_num has at least this many elements;
 * the 3-gram store in set_and_check_total_info() already relies on it.
 */
enum { ARPA_MAX_ORDER = 3 };

/**
 * Get N-gram word/class id of a string, and terminate program if not found.
 *
 * @param ndata [in] N-gram data
 * @param str [in] name string of N-gram entry
 *
 * @return the entry ID.
 */
static WORD_ID
lookup_word(NGRAM_INFO *ndata, char *str)
{
  WORD_ID wid;

  if ((wid = ngram_lookup_word(ndata, str)) == WORD_INVALID) {
    j_error("word %s not in N-gram vocabulary.\n",str);
  }
  return wid;
}

/**
 * @brief  Set unknown word ID to the N-gram data.
 *
 * In CMU-Cam SLM toolkit, OOV words are always mapped to UNK, which
 * always appear at the very beginning of N-gram entry, so we fix the
 * unknown word ID at "0".  The model is always marked as open
 * vocabulary; no lookup of the unknown word by name is performed.
 *
 * @param ndata [out] N-gram data to set unknown word ID.
 */
void
set_unknown_id(NGRAM_INFO *ndata)
{
  ndata->isopen = TRUE;
  ndata->unk_id = 0;		/* unknown (OOV) words are always mapped to
				   the number 0 (by CMU-TK) */
}

/**
 * Split an ARPA header line of the form "ngram N=COUNT" in place.
 *
 * The order N is taken from the last non-blank character before '=',
 * and must be between 1 and ARPA_MAX_ORDER.  Lines that lack the '=',
 * lack the count, or carry another order are rejected so that callers
 * never index the count table with a value read unchecked from the file.
 *
 * @param line [i/o] header line; it is modified by strtok()
 * @param n_ret [out] zero-based order (N - 1)
 * @param count_ret [out] COUNT converted with atoi()
 *
 * @return 1 if the line was accepted, 0 if it was rejected.
 */
static int
parse_ngram_total(char *line, int *n_ret, int *count_ret)
{
  char *key, *val;
  size_t len;

  key = strtok(line, "=");
  if (key == NULL) return 0;
  val = strtok(NULL, "=");
  if (val == NULL) return 0;

  /* tolerate blanks between the order digit and '=' */
  len = strlen(key);
  while (len > 0 && (key[len-1] == ' ' || key[len-1] == '\t')) len--;
  if (len == 0) return 0;
  if (key[len-1] < '1' || key[len-1] > '0' + ARPA_MAX_ORDER) return 0;

  *n_ret = key[len-1] - '0' - 1;
  *count_ret = atoi(val);
  return 1;
}

/**
 * Set number of N-gram entries, for reading the first LR 2-gram.
 *
 * Reads lines until end of file or a line beginning with a backslash.
 * Malformed "ngram" lines are reported and skipped.
 *
 * @param fp [in] file pointer
 * @param ndata [out] N-gram data to set it.
 */
static void
set_total_info(FILE *fp, NGRAM_INFO *ndata)
{
  int n, count;

  while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
    if (strnmatch(buf, "ngram", 5)) { /* n-gram num */
      if (!parse_ngram_total(buf, &n, &count)) {
	j_printerr("Warning: malformed n-gram total line ignored\n");
	continue;
      }
      ndata->ngram_num[n] = count;
    }
  }
}

/**
 * Read number of N-gram entries of the second RL 3-gram, and
 * check if those values are the same as the previous LR values.
 *
 * The 3-gram total is stored.  A differing 1-gram or 2-gram total only
 * produces a warning.  Malformed "ngram" lines are reported and skipped.
 *
 * @param fp [in] file pointer
 * @param ndata [i/o] N-gram data
 */
static void
set_and_check_total_info(FILE *fp, NGRAM_INFO *ndata)
{
  int n, count;

  while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
    if (strnmatch(buf, "ngram", 5)) { /* n-gram num */
      if (!parse_ngram_total(buf, &n, &count)) {
	j_printerr("Warning: malformed n-gram total line ignored\n");
	continue;
      }
      if (n == 2) {		/* 3-gram */
	ndata->ngram_num[n] = count;
      } else {
	if (n <= 1 && ndata->ngram_num[n] != count) {
	  j_printerr("Warning: %d-gram total num differ! may cause read error\n",n+1);
	}
      }
    }
  }
}

/**
 * Read word/class entry names and 1-gram data from LR 2-gram file.
*  * @param fp [in] file pointer * @param ndata [out] N-gram to set the read data. */static voidset_unigram(FILE *fp, NGRAM_INFO *ndata){  WORD_ID read_word_num;	/* # of words already read */  WORD_ID nid, resid;  LOGPROB prob, bo_wt;  char *name, *p;  /* malloc area */  ndata->wname = (char **)mymalloc(sizeof(char *)*ndata->ngram_num[0]);  ndata->p = (LOGPROB *)mymalloc(sizeof(LOGPROB)*ndata->ngram_num[0]);  ndata->bo_wt_lr = (LOGPROB *)mymalloc(sizeof(LOGPROB)*ndata->ngram_num[0]);  ndata->bo_wt_rl = (LOGPROB *)mymalloc(sizeof(LOGPROB)*ndata->ngram_num[0]);  ndata->n2_bgn = (NNID *)mymalloc(sizeof(NNID)*ndata->ngram_num[0]);  ndata->n2_num = (WORD_ID *)mymalloc(sizeof(WORD_ID)*ndata->ngram_num[0]);  read_word_num = 0;    while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {    prob = (LOGPROB)atof(first_token(buf));    p = next_token();    name = strcpy((char *)mymalloc(strlen(p)+1), p);    bo_wt = (LOGPROB)atof(next_token());    /* register unigram */    nid = read_word_num;    ndata->wname[nid] = name;    /* add entry name to index tree */    if (ndata->root == NULL) {      ndata->root = ptree_make_root_node(nid);    } else {      resid = ptree_search_data(name, ndata->root);      if (strmatch(name, ndata->wname[resid])) { /* already exist */	j_error("Error: word \"%s\" multiply defined at (#%d and #%d)\n",		   name, resid, nid);      } else {	ptree_add_entry(name, nid, ndata->wname[resid], &(ndata->root));      }    }    ndata->p[nid] = prob;    ndata->bo_wt_lr[nid] = bo_wt;    ndata->n2_bgn[nid] = NNID_INVALID;    ndata->n2_num[nid] = 0;      read_word_num++;    if (read_word_num > ndata->max_word_num) {      j_printerr("Error: actual n-gram word num exceeded header value\n");      j_error("%d > %d\n", read_word_num, ndata->max_word_num);    }  }  if (read_word_num != ndata->ngram_num[0]) {    j_printerr("Error: actual n-gram word num not match the header value\n");    j_error("%d != %d ?\n", read_word_num, ndata->ngram_num[0]);  }  j_printerr("  
1-gram read %d end\n", read_word_num);}/* read-in 1-gram (RL) --- only add back-off weight *//**  * Read 1-gram data from RL 3-gram file.  Only the back-off weights are * stored. *  * @param fp [in] file pointer * @param ndata [out] N-gram to store the read data. */static voidadd_unigram(FILE *fp, NGRAM_INFO *ndata){  WORD_ID read_word_num;  WORD_ID nid;  LOGPROB prob, bo_wt;  char *name, *p;  read_word_num = 0;  while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {    prob = atof(first_token(buf));    p = next_token();    name = strcpy((char *)mymalloc(strlen(p)+1), p);    bo_wt = (LOGPROB)atof(next_token());      /* add bo_wt_rl to existing 1-gram entry */    nid = lookup_word(ndata, name);    if (nid == WORD_INVALID) {      j_printerr("Warning: n-gram word \"%s\" in RL not exist in LR (ignored)\n", name);    } else {      ndata->bo_wt_rl[nid] = bo_wt;    }      read_word_num++;    if (read_word_num > ndata->max_word_num) {      j_printerr("Error: actual n-gram word num exceeded header value\n");      j_error("%d > %d\n", read_word_num, ndata->max_word_num);    }    free(name);  }  j_printerr("  1-gram read %d end\n", read_word_num);  }/**  * Read 2-gram data from LR 2-gram file. *  * @param fp [in] file pointer * @param ndata [out] N-gram to set the read data. 
*/
/*
 * set_bigram() notes: each line is "prob left-word right-word".
 * Entries must be grouped by left word, with right words in strictly
 * increasing ID order inside a group.  The function aborts through
 * j_error() on an unsorted, duplicated or malformed entry, and when
 * more entries are found than the header value ngram_num[1].
 */
static void
set_bigram(FILE *fp, NGRAM_INFO *ndata)
{
  int w_l, w_r;
  int w_last, w_r_last;
  LOGPROB p;
  NNID n2;
  char *tok;

  ndata->n2tonid = (WORD_ID *)mymalloc(sizeof(WORD_ID)*ndata->ngram_num[1]);
  ndata->p_lr = (LOGPROB *)mymalloc(sizeof(LOGPROB)*ndata->ngram_num[1]);
  ndata->p_rl = (LOGPROB *)mymalloc(sizeof(LOGPROB)*ndata->ngram_num[1]);
  ndata->bo_wt_rrl = (LOGPROB *)mymalloc(sizeof(LOGPROB)*ndata->ngram_num[1]);

  n2 = 0;

  /* read in LR 2-gram */
  w_last = -1; w_r_last = -1;
  for (;;) {
    if (getl(buf, sizeof(buf), fp) == NULL || buf[0] == '\\') break;
    strcpy(pbuf, buf);		/* keep an unmodified copy for messages */

    /* NOTE(review): first_token()/next_token() are presumed to return
       NULL when no token is left -- confirm against their definition */
    tok = first_token(buf);
    if (tok == NULL) continue;	/* nothing to parse on this line */

    /* check total num before storing: the 2-gram arrays hold exactly
       ngram_num[1] elements */
    if (n2 >= ndata->ngram_num[1]) {
      j_printerr("Error: actual 2-gram num not match the header value\n");
      j_error("%d != %d ?\n", n2 + 1, ndata->ngram_num[1]);
    }

    if ( n2 % 100000 == 0) {
      /* ngram_num[1] is non-zero here thanks to the check above;
	 compute in floating point so that n2 * 100 cannot overflow */
      j_printerr("  2-gram read %d (%d%%)\n", n2,
		 (int)(100.0 * n2 / ndata->ngram_num[1]));
    }

    /* 2-gram probability */
    p = (LOGPROB)atof(tok);

    /* read in left (context) word and lookup the ID */
    tok = next_token();
    if (tok == NULL) {
      j_printerr("Error: malformed 2-gram entry\n");
      j_error("at 2-gram #%d: \"%s\"\n", n2+1, pbuf);
    }
    w_l = lookup_word(ndata, tok);

    /* increment n2_bgn and n2_num if context word changed */
    if (w_l != w_last) {
      if (w_last != -1) ndata->n2_num[w_last] = n2 - ndata->n2_bgn[w_last];
      /* the next context word should be an new entry */
      if (ndata->n2_bgn[w_l] != NNID_INVALID) {
	j_printerr("Error: entry not sorted (same left context not sequenced)\n");
	j_error("at 2-gram #%d: \"%s\"\n", n2+1, pbuf);
      }
      ndata->n2_bgn[w_l] = n2;
      w_r_last = -1;
    }

    /* read in right word and set */
    tok = next_token();
    if (tok == NULL) {
      j_printerr("Error: malformed 2-gram entry\n");
      j_error("at 2-gram #%d: \"%s\"\n", n2+1, pbuf);
    }
    w_r = lookup_word(ndata, tok);
    if (w_r == w_r_last) {
      j_printerr("Error: duplicated entry\n");
      j_error("at 2-gram #%d: \"%s\"\n", n2+1, pbuf);
    } else if (w_r < w_r_last) {
      j_printerr("Error: entry not sorted downward\n");
      j_error("at 2-gram #%d: \"%s\"\n", n2+1, pbuf);
    }
    ndata->n2tonid[n2] = w_r;
    ndata->p_lr[n2] = p;

    n2++;
    w_last = w_l;
    w_r_last = w_r;
  }

  /* set the last entry (there is none when no 2-gram was read) */
  if (w_last != -1) ndata->n2_num[w_last] = n2 - ndata->n2_bgn[w_last];
  j_printerr("  2-gram read %d end\n", n2);
}

/**
 * Read reverse 2-gram data from RL 3-gram file, and set RL 2-gram
 * probabilities and back-off values for RL 3-gram to the corresponding
 * LR 2-gram data.
 *
 * @param fp [in] file pointer
 * @param ndata [i/o] N-gram to set the read data.
 */
static void
add_bigram_rl(FILE *fp, NGRAM_INFO *ndata)
{
  WORD_ID w_l, w_r;
  LOGPROB prob, bo_wt;
  int bi_count = 0;
  NNID n2;

  while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
    /* p(w_l|w_r) w_r w_l bo_wt_rl */
    if ( ++bi_count % 100000 == 0) {
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -