confnet.c

来自「julius version 4.12.about sound recognit」· C语言 代码 · 共 890 行 · 第 1/2 页

C
890
字号
  overlap = (PROB)overlap_frame / (PROB)sum_len;#ifdef CDEBUG2  printf("[%d..%d] [%d..%d]  overlap = %d / %d = %f",	 w1->lefttime, w1->righttime, w2->lefttime, w2->righttime,	 overlap_frame, sum_len, overlap);#endif#ifdef PREFER_GRAPH_CM#ifdef CDEBUG2  printf("  cm=%f, %f", w1->graph_cm, w2->graph_cm);#endif  sim = overlap * w1->graph_cm * w2->graph_cm;#else #ifdef CDEBUG2  printf("  cm=%f, %f", w1->cmscore, w2->cmscore);#endif  sim = overlap * w1->cmscore * w2->cmscore;#endif#ifdef CDEBUG2  printf("  similarity=%f\n", sim);#endif  return sim;}/**  * Compute intra-word similarity of two clusters. *  * @param c1 [in] cluster 1 * @param c2 [in] cluster 2 * @param winfo [in] word dictionary *  * @return the maximum similarity. */static PROBget_cluster_intraword_similarity(CN_CLUSTER *c1, CN_CLUSTER *c2, WORD_INFO *winfo){  int i1, i2;  PROB simmax, sim;  simmax = 0.0;  for(i1 = 0; i1 < c1->wgnum; i1++) {    for(i2 = 0; i2 < c2->wgnum; i2++) {      if (is_same_word(c1->wg[i1]->wid, c2->wg[i2]->wid, winfo)) {	//if (graph_ordered(c1->wg[i1]->id, c2->wg[i2]->id)) continue;	sim = get_intraword_similarity(c1->wg[i1], c2->wg[i2]);	if (simmax < sim) simmax = sim;      }    }  }  return(simmax);}#ifdef CDEBUG/**  * Output a cluster information. * * @param fp [in] file pointer to output * @param c [in] cluster to output * @param winfo [in] word dictionary */static voidput_cluster(FILE *fp, CN_CLUSTER *c, WORD_INFO *winfo){  int i;  for(i=0;i<c->wgnum;i++) {    fprintf(fp, "[%d:%s:%d..%d]", c->wg[i]->id, winfo->woutput[c->wg[i]->wid], c->wg[i]->lefttime, c->wg[i]->righttime);  }  printf("\n");}#endif/**  * Return minimum value of the three arguments. *  * @param a [in] value 1 * @param b [in] value 2 * @param c [in] value 3 *  * @return the minumum value. */static intminimum(int a, int b, int c){  int min;  min = a;  if (b < min)    min = b;  if (c < min)    min = c;  return min;}/**  * Calculate Levenstein distance (edit distance) of two words. *  * @param w1 [in] word ID 1 * @param w2 [in] word ID 2 * @param winfo [in] word dictionary *  * @return the distance. */static intedit_distance(WORD_ID w1, WORD_ID w2, WORD_INFO *winfo, char *b1, char *b2){  int i1, i2;  int *d;  int len1, len2;  int j;  int cost;  int distance;  len1 = winfo->wlen[w1] + 1;  len2 = winfo->wlen[w2] + 1;  d = (int *)mymalloc(sizeof(int) * len1 * len2);  for(j=0;j<len1;j++) d[j] = j;  for(j=0;j<len2;j++) d[j*len1] = j;  for(i1=1;i1<len1;i1++) {    center_name(winfo->wseq[w1][i1-1]->name, b1);    for(i2=1;i2<len2;i2++) {      center_name(winfo->wseq[w2][i2-1]->name, b2);      if (strmatch(b1, b2)) {	cost = 0;      } else {	cost = 1;      }      d[i2 * len1 + i1] = minimum(d[(i2-1) * len1 + i1] + 1, d[i2 * len1 + (i1-1)] + 1, d[(i2-1) * len1 + (i1-1)] + cost);    }  }  distance = d[len1 * len2 - 1];  free(d);  return(distance);}/**  * Compute inter-word similarity of two clusters. *  * @param c1 [in] cluster 1 * @param c2 [in] cluster 2 * @param winfo [in] word dictionary *  * @return the average similarity. */static PROBget_cluster_interword_similarity(RecogProcess *r, CN_CLUSTER *c1, CN_CLUSTER *c2, WORD_INFO *winfo, char *buf1, char *buf2){  int i1, i2, j;  WORD_ID w1, w2;  PROB p1, p2;  PROB sim, simsum;  int simsum_count;  int dist;  /* order check */  for(i1 = 0; i1 < c1->wgnum; i1++) {    for(i2 = 0; i2 < c2->wgnum; i2++) {      if (graph_ordered(r, c1->wg[i1]->id, c2->wg[i2]->id)) {	/* ordered clusters should not be merged */	//printf("Ordered:\n");	//printf("c1:\n"); put_cluster(stdout, c1, winfo);	//printf("c2:\n"); put_cluster(stdout, c2, winfo);	return 0.0;      }    }  }#ifdef CDEBUG2  printf("-----\n");  printf("c1:\n"); put_cluster(stdout, c1, winfo);  printf("c2:\n"); put_cluster(stdout, c2, winfo);#endif  /* compute similarity */  simsum = 0.0;  simsum_count = 0;  for(i1 = 0; i1 < c1->wordsnum; i1++) {    w1 = c1->words[i1];    p1 = 0.0;    for(j = 0; j < c1->wgnum; j++) {      if (is_same_word(c1->wg[j]->wid, w1, winfo)) {#ifdef PREFER_GRAPH_CM	p1 += c1->wg[j]->graph_cm;#else	p1 += c1->wg[j]->cmscore;#endif      }    }    for(i2 = 0; i2 < c2->wordsnum; i2++) {      w2 = c2->words[i2];      p2 = 0.0;      for(j = 0; j < c2->wgnum; j++) {	if (is_same_word(c2->wg[j]->wid, w2, winfo)) {#ifdef PREFER_GRAPH_CM	  p2 += c2->wg[j]->graph_cm;#else	  p2 += c2->wg[j]->cmscore;#endif	}      }      dist = edit_distance(w1, w2, winfo, buf1, buf2);#ifdef CDEBUG2      for(j=0;j<winfo->wlen[w1];j++) {	printf("%s ", winfo->wseq[w1][j]->name);      }      printf("\n");      for(j=0;j<winfo->wlen[w2];j++) {	printf("%s ", winfo->wseq[w2][j]->name);      }      printf("\n");      printf("distance=%d\n", dist);#endif      sim = 1.0 - (float)dist / (float)(winfo->wlen[w1] + winfo->wlen[w2]);#ifdef CDEBUG2      printf("(%s) - (%s): sim = %f, p1 = %f, p2 = %f\n", winfo->woutput[w1], winfo->woutput[w2], sim, p1, p2);#endif      simsum += sim * p1 * p2;      simsum_count++;    }  }#ifdef CDEBUG2  printf("SIM=%f\n", simsum / simsum_count);  printf("-----\n");#endif  return(simsum / simsum_count);}/**  * @brief  Create a confusion network from word graph. * * @param root [in] root pointer of word graph * @param r [in] recognition process instance * * @return root pointer to the cluster list. * * @callgraph * @callergraph *  */CN_CLUSTER *confnet_create(WordGraph *root, RecogProcess *r){  CN_CLUSTER *croot;  CN_CLUSTER *c, *cc, *cmax1, *cmax2;  WordGraph *wg;  PROB sim, max_sim;  int wg_totalnum, n, i;  char *buf1, *buf2;  buf1 = (char *)mymalloc(MAX_HMMNAME_LEN);  buf2 = (char *)mymalloc(MAX_HMMNAME_LEN);  /* make initial confnet instances from word graph */  croot = NULL;  wg_totalnum = 0;  for(wg=root;wg;wg=wg->next) {    c = cn_new();    cn_add_wg(c, wg);    c->next = croot;    croot = c;    wg_totalnum++;  }  /* intraword clustering iteration */  do {    /* find most similar pair */    max_sim = 0.0;    for(c=croot;c;c=c->next) {      for(cc=c->next;cc;cc=cc->next) {	sim = get_cluster_intraword_similarity(c, cc, r->lm->winfo);	if (max_sim < sim) {	  max_sim = sim;	  cmax1 = c;	  cmax2 = cc;	}      }    }    /* merge the maximum one if exist */    if (max_sim != 0.0) {#ifdef CDEBUG      printf(">>> max_sim = %f\n", max_sim);      put_cluster(stdout, cmax1, r->lm->winfo);      put_cluster(stdout, cmax2, r->lm->winfo);#endif      cn_merge(r, cmax1, cmax2);      cn_destroy(cmax2, &croot);    }  } while (max_sim != 0.0); /* loop until no more similar pair exists */  n = 0;  for(c=croot;c;c=c->next) n++;  if (verbose_flag) jlog("STAT: confnet: %d words -> %d clusters by intra-word clustering\n", wg_totalnum, n);#ifdef CDEBUG  printf("---- result of intra-word clustering ---\n");  i = 0;  for(c=croot;c;c=c->next) {    printf("%d :", i);    put_cluster(stdout, c, r->lm->winfo);#ifdef CDEBUG2    for(i=0;i<c->wgnum;i++) {      printf("    ");      put_wordgraph(stdout, c->wg[i], r->lm->winfo);    }#endif    i++;  }  printf("----------------------------\n");#endif  /* inter-word clustering */  do {    /* build word list for each cluster */    for(c=croot;c;c=c->next) cn_build_wordlist(c, r->lm->winfo);    /* find most similar pair */    max_sim = 0.0;    for(c=croot;c;c=c->next) {      for(cc=c->next;cc;cc=cc->next) {	sim = get_cluster_interword_similarity(r, c, cc, r->lm->winfo, buf1, buf2);	if (max_sim < sim) {	  max_sim = sim;	  cmax1 = c;	  cmax2 = cc;	}      }    }    /* merge the maximum one if exist */    if (max_sim != 0.0) {#ifdef CDEBUG      printf(">>> max_sim = %f\n", max_sim);      put_cluster(stdout, cmax1, r->lm->winfo);      put_cluster(stdout, cmax2, r->lm->winfo);#endif      cn_merge(r, cmax1, cmax2);      cn_destroy(cmax2, &croot);    }  } while (max_sim != 0.0); /* loop until no more similar pair exists */  n = 0;  for(c=croot;c;c=c->next) n++;  if (verbose_flag) jlog("STAT: confnet: -> %d clusters by inter-word clustering\n", n);  /* compute posterior probabilities and insert NULL entry */  {    PROB p, psum;    int j;    for(c=croot;c;c=c->next) {      psum = 0.0;      c->pp = (LOGPROB *)mymalloc(sizeof(LOGPROB) * (c->wordsnum + 1));      for(i=0;i<c->wordsnum;i++) {	p = 0.0;	for(j = 0; j < c->wgnum; j++) {	  if (is_same_word(c->wg[j]->wid, c->words[i], r->lm->winfo)) {#ifdef PREFER_GRAPH_CM	    p += c->wg[j]->graph_cm;#else	    p += c->wg[j]->cmscore;#endif	  }	}	c->pp[i] = p;	psum += p;      }      if (psum < 1.0) {	c->words[c->wordsnum] = WORD_INVALID;	c->pp[c->wordsnum] = 1.0 - psum;	c->wordsnum++;      }    }  }  /* sort the words in each cluster by their posterior probabilities */  {    int j;    WORD_ID wtmp;    LOGPROB ltmp;    for(c=croot;c;c=c->next) {      for(i=0;i<c->wordsnum;i++) {	for(j=c->wordsnum - 1;j>i;j--) {	  if (c->pp[j-1] < c->pp[j]) {	    ltmp = c->pp[j-1];	    c->pp[j-1] = c->pp[j];	    c->pp[j] = ltmp;	    wtmp = c->words[j-1];	    c->words[j-1] = c->words[j];	    c->words[j] = wtmp;	  }	}      }    }  }  /* re-order clusters by their beginning frames */  {    CN_CLUSTER **clist;    int k;    /* sort cluster list by the left frame*/    clist = (CN_CLUSTER **)mymalloc(sizeof(CN_CLUSTER *) * n);    for(i=0,c=croot;c;c=c->next) {      clist[i++] = c;    }    qsort_reentrant(clist, n, sizeof(CN_CLUSTER *), (int (*)(const void *, const void *, void *))compare_cluster, r);    croot = NULL;    for(k=0;k<n;k++) {      if (k == 0) croot = clist[k];      if (k == n - 1) clist[k]->next = NULL;      else clist[k]->next = clist[k+1];    }    free(clist);  }#if 0  /* output */  printf("---- begin confusion network ---\n");  for(c=croot;c;c=c->next) {    for(i=0;i<c->wordsnum;i++) {      printf("(%s:%.3f)", (c->words[i] == WORD_INVALID) ? "-" : r->lm->winfo->woutput[c->words[i]], c->pp[i]);      if (i == 0) printf("  ");    }    printf("\n");  }  printf("---- end confusion network ---\n");#endif  free(buf2);  free(buf1);  return(croot);}/* end of file */

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?