📄 ngram_read_arpa.c

📁 About sound recognition. I want to download.
💻 C
📖 第 1 页 / 共 2 页
字号:
上一页 12
      j_printerr("  2-gram read %d (%d%%)\n", bi_count, bi_count * 100 / ndata->ngram_num[1]);    }    prob = (LOGPROB)atof(first_token(buf));    w_r = lookup_word(ndata, next_token());    w_l = lookup_word(ndata, next_token());    bo_wt = (LOGPROB)atof(next_token());    n2 = search_bigram(ndata, w_l, w_r);    if (n2 == NNID_INVALID) {      j_printerr("Warning: (%s,%s) not exist in LR 2-gram (ignored)\n",	      ndata->wname[w_l], ndata->wname[w_r]);    } else {      ndata->p_rl[n2] = prob;      ndata->bo_wt_rrl[n2] = bo_wt;    }  }  j_printerr("  2-gram read %d end\n", bi_count);  }    /**  * Read reverse 3-gram data from RL 3-gram file and store them. *  * @param fp [in] file pointer * @param ndata [i/o] N-gram to set the read data. */static voidset_trigram(FILE *fp, NGRAM_INFO *ndata){  int w_l, w_m, w_r;  LOGPROB p_rl;  int w_r_last, w_m_last, w_l_last;  NNID n2, n2_last;  NNID n3;  NNID ntmp;  /* allocate pointer from 2gram to 3gram */  switch(ndata->version) {  case 3:    ndata->n3_bgn = (NNID *)mymalloc(sizeof(NNID)*ndata->ngram_num[1]);    for(n2=0;n2<ndata->ngram_num[1];n2++) ndata->n3_bgn[n2] = NNID_INVALID;    break;  case 4:    ndata->n3_bgn_upper = (NNID_UPPER *)mymalloc(sizeof(NNID_UPPER)*ndata->ngram_num[1]);    ndata->n3_bgn_lower = (NNID_LOWER *)mymalloc(sizeof(NNID_LOWER)*ndata->ngram_num[1]);    for(n2=0;n2<ndata->ngram_num[1];n2++) {          ndata->n3_bgn_upper[n2] = NNID_INVALID_UPPER;      ndata->n3_bgn_lower[n2] = 0;    }    break;  }  ndata->n3_num = (WORD_ID *)mymalloc(sizeof(WORD_ID)*ndata->ngram_num[1]);  for(n2=0;n2<ndata->ngram_num[1];n2++) ndata->n3_num[n2] = 0;  /* allocate data area for 3-gram */  ndata->n3tonid = (WORD_ID *)mymalloc(sizeof(WORD_ID)*ndata->ngram_num[2]);  ndata->p_rrl = (LOGPROB *)mymalloc(sizeof(LOGPROB)*ndata->ngram_num[2]);  n3 = 0;  n2 = n2_last = NNID_INVALID;  w_r_last = w_m_last = w_l_last = -1;  for (;;) {    if (getl(buf, sizeof(buf), fp) == NULL || buf[0] == '\\') break;    strcpy(pbuf, buf);    if (n3 % 
100000 == 0) {      j_printerr("  3-gram read %d (%d%%)\n", n3, n3 * 100 / ndata->ngram_num[2]);    }    /* N-gram probability */    p_rl = (LOGPROB)atof(first_token(buf));    /* read in right (first) word and lookup its ID */    w_r = lookup_word(ndata, next_token());    /* read in middle word and lookup its ID */    w_m = lookup_word(ndata, next_token());    /* if context changed, create the next structure */    if (w_r != w_r_last || w_m != w_m_last) {      n2 = search_bigram(ndata, (WORD_ID)w_m, (WORD_ID)w_r);      if (n2 == NNID_INVALID) {	/* no context */        j_printerr("Warning: context (%s,%s) not exist in LR 2-gram (ignored)\n",		   ndata->wname[w_m], ndata->wname[w_r]);        continue;      }      switch(ndata->version) {      case 3:	ntmp = ndata->n3_bgn[n2_last];	break;      case 4:	ntmp = ((NNID)(ndata->n3_bgn_upper[n2_last]) << 16) + (NNID)(ndata->n3_bgn_lower[n2_last]);	break;      }      if (n2_last != NNID_INVALID) ndata->n3_num[n2_last] = n3 - ntmp;      /* check: the next 'n2' should be an new entry */      switch(ndata->version) {      case 3:	if (ndata->n3_bgn[n2] != NNID_INVALID) {	  j_printerr("Error: entry not sorted (same left context not sequenced)\n");	  j_error("at 3-gram #%d: \"%s\"\n", n3+1, pbuf);	}	ndata->n3_bgn[n2] = n3;	break;      case 4:	if (ndata->n3_bgn_upper[n2] != NNID_INVALID_UPPER) {	  j_printerr("Error: entry not sorted (same left context not sequenced)\n");	  j_error("at 3-gram #%d: \"%s\"\n", n3+1, pbuf);	}	ntmp = n3 & 0xffff;	ndata->n3_bgn_lower[n2] = ntmp;	ntmp = n3 >> 16;	ndata->n3_bgn_upper[n2] = ntmp;	break;      }      n2_last = n2;      w_l_last = -1;    } else {      if (n2 == NNID_INVALID) continue;    }        /* read in left (last) word and store */    w_l = lookup_word(ndata, next_token());    if (w_l == w_l_last) {      j_printerr("Error: duplicated entry\n");      j_error("at 3-gram #%d: \"%s\"\n", n3+1, pbuf);    } else if (w_l < w_l_last) {      j_printerr("Error: entry not sorted downward\n");      
j_error("at 3-gram #%d: \"%s\"\n", n3+1, pbuf);    }    ndata->n3tonid[n3] = w_l;    ndata->p_rrl[n3] = p_rl;    n3++;    w_m_last = w_m;    w_r_last = w_r;    w_l_last = w_l;    /* check the 3-gram num */    if (n3 > ndata->ngram_num[2]) {      j_printerr("Error: actual 3-gram num not match the header value\n");      j_error("%d != %d ?\n", n3, ndata->ngram_num[2]);    }  }  /* store the last n3_num */  switch(ndata->version) {  case 3:    ntmp = ndata->n3_bgn[n2_last];    break;  case 4:    ntmp = ((NNID)(ndata->n3_bgn_upper[n2_last]) << 16) + (NNID)(ndata->n3_bgn_lower[n2_last]);    break;  }  ndata->n3_num[n2_last] = n3 - ntmp;  j_printerr("  3-gram read %d end\n", n3);}static boolean LR_2gram_read = FALSE; ///< TRUE if LR 2gram has already been read/**  * Read in one ARPA N-gram file, either LR 2-gram or RL 3-gram. *  * @param fp [in] file pointer * @param ndata [out] N-gram data to store the read data * @param direction [in] specify whether this is LR 2-gram or RL 3-gram *  * @return TRUE on success, FALSE on failure. 
*/booleanngram_read_arpa(FILE *fp, NGRAM_INFO *ndata, int direction){  int n;  ndata->from_bin = FALSE;  if (!LR_2gram_read && direction == DIR_RL) {    j_printerr("you should first read LR 2-gram\n");    return FALSE;  }  if (direction == DIR_LR) {    n = 2;  } else {    n = 3;  }  /* read until `\data\' found */  while (getl(buf, sizeof(buf), fp) != NULL && strncmp(buf,"\\data\\",6) != 0);      /* read n-gram total info */  if (direction == DIR_LR) {    set_total_info(fp, ndata);  } else {    set_and_check_total_info(fp, ndata);  }  if (ndata->ngram_num[0] > MAX_WORD_NUM) {    j_error("Error: vocabulary size exceeded limit (%d)\n", MAX_WORD_NUM);  }  ndata->max_word_num = ndata->ngram_num[0];  /* version requirement check (determined by 3-gram entry limit) */  if (n >= 3) {    if (ndata->ngram_num[2] >= NNIDMAX) {      j_printerr("Warning: more than %d 3-gram tuples, use old structure\n", NNIDMAX);      ndata->version = 3;    } else {      ndata->version = 4;    }  }    /* read 1-gram data */  if (!strnmatch(buf,"\\1-grams",8)) {    j_error("data format error: 1-gram not found\n");  }  j_printerr("  reading 1-gram part...\n");  if (direction == DIR_LR) {    set_unigram(fp, ndata);  } else {    add_unigram(fp, ndata);  }    if (n >= 2) {    /* read 2-gram data */    if (!strnmatch(buf,"\\2-grams", 8)) {      j_error("data format error: 2-gram not found\n");    }    j_printerr("  reading 2-gram part...\n");    if (direction == DIR_LR) {      set_bigram(fp, ndata);    } else {      add_bigram_rl(fp, ndata);    }  }  if (n >= 3) {      /* read 3-gram data */    if (!strnmatch(buf,"\\3-grams", 8)) {      j_error("data format error: 3-gram not found\n");    }    if ( direction == DIR_LR) {      j_error("should not happen..\n");    } else {      j_printerr("  reading 3-gram part...\n");      set_trigram(fp, ndata);    }  }  /* finished */  if (!strnmatch(buf, "\\end", 4)) {    j_error("data format error: data end marker \"\\end\" not found\n");  }#ifdef CLASS_NGRAM  /* 
skip in-class word entries (they should be in word dictionary) */  if (getl(buf, sizeof(buf), fp) != NULL) {    if (strnmatch(buf, "\\class", 6)) {      j_printerr("  skipping in-class word entries...\n");    }  }#endif  if (n >= 3 && ndata->version == 4) {    /* compact the 2-gram back-off and 3-gram links */    ngram_compact_bigram_context(ndata);  }    /* set unknown (=OOV) word id */  set_unknown_id(ndata);  if (direction == DIR_LR) {    LR_2gram_read = TRUE;  }  return TRUE;}/**  * Compact the 2-gram context information. *  * @param ndata [i/o] N-gram data */voidngram_compact_bigram_context(NGRAM_INFO *ndata){  NNID i;  int c;  int dst;  NNID ntmp;  /* version check */  if (ndata->version != 4) {    j_error("InternalError: bigram context compaction called for version != 4\n");  }  /* count number of valid bigram context */  c = 0;  for(i=0;i<ndata->ngram_num[1];i++) {    if (ndata->n3_bgn_upper[i] != NNID_INVALID_UPPER) {      c++;    } else {      if (ndata->n3_num[i] != 0) {	printf("bgn=%d|%d, num=%d, bo_wt_rrl=%f\n",	       ndata->n3_bgn_upper[i], 	       ndata->n3_bgn_lower[i], 	       ndata->n3_num[i],	       ndata->bo_wt_rrl[i]);	j_error("Error: ngram_compact_bigram_context: internal error\n");      }      if (ndata->bo_wt_rrl[i] != 0.0) {	j_error("Error: 2-gram has no upper 3-gram, but not 0.0 back-off weight\n");      }    }  }  ndata->bigram_bo_num = c;  j_printerr("num: %d -> %d\n", ndata->ngram_num[1], ndata->bigram_bo_num);    /* allocate index buffer */  ndata->n2bo_upper = (NNID_UPPER *)mymalloc(sizeof(NNID_UPPER) * ndata->ngram_num[1]);  ndata->n2bo_lower = (NNID_LOWER *)mymalloc(sizeof(NNID_LOWER) * ndata->ngram_num[1]);  /* make index and do compaction of context informations */  dst = 0;  for(i=0;i<ndata->ngram_num[1];i++) {    if (ndata->n3_bgn_upper[i] != NNID_INVALID_UPPER) {      ndata->bo_wt_rrl[dst] = ndata->bo_wt_rrl[i];      ndata->n3_bgn_upper[dst] = ndata->n3_bgn_upper[i];      ndata->n3_bgn_lower[dst] = ndata->n3_bgn_lower[i];      
ndata->n3_num[dst] = ndata->n3_num[i];      ntmp = dst & 0xffff;      ndata->n2bo_lower[i] = ntmp;      ntmp = dst >> 16;      ndata->n2bo_upper[i] = ntmp;      dst++;    } else {      ndata->n2bo_upper[i] = NNID_INVALID_UPPER;      ndata->n2bo_lower[i] = 0;    }  }  /* really shrink the memory area */  ndata->bo_wt_rrl = (LOGPROB *)myrealloc(ndata->bo_wt_rrl, sizeof(LOGPROB) * ndata->bigram_bo_num);  ndata->n3_bgn_upper = (NNID_UPPER *)myrealloc(ndata->n3_bgn_upper, sizeof(NNID_UPPER) * ndata->bigram_bo_num);  ndata->n3_bgn_lower = (NNID_LOWER *)myrealloc(ndata->n3_bgn_lower, sizeof(NNID_LOWER) * ndata->bigram_bo_num);  ndata->n3_num = (WORD_ID *)myrealloc(ndata->n3_num, sizeof(WORD_ID) * ndata->bigram_bo_num);}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -