📄 wchmm.c
字号:
} else { p = LOG_ZERO; } if (wchmm->lmvar == LM_NGRAM_USER) { p = (*(wchmm->uni_prob_user))(wchmm->winfo, i, p); } if (p >= separate_thres && wchmm->separated_word_count < lmconf->separate_wnum) { /* 裳刨の光い帽胳を腾菇陇步しない */ /* separate_thres は惧疤separate_wnum戎誊のスコア */ if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) { jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n"); ok_p = FALSE; } wchmm->separated_word_count++; continue; }#endif } /* 呵も墓く不燎を鼎铜叫丸る帽胳を玫す */ maxsharelen=0; for (j=0;j<i;j++) { if (wchmm->category_tree && wchmm->lmtype == LM_DFA) { if (wchmm->winfo->wton[i] != wchmm->winfo->wton[j]) continue; } sharelen = wchmm_check_match(wchmm->winfo, i, j); if (sharelen == wchmm->winfo->wlen[i] && sharelen == wchmm->winfo->wlen[j]) { /* word に票不胳が赂哼する */ /* 涩ず呵络の墓さであり·脚剩カウントを闰けるためここで却ける */ maxsharelen = sharelen; matchword = j; break; } if (sharelen > maxsharelen) { matchword = j; maxsharelen = sharelen; } } if (wchmm_add_word(wchmm, i, maxsharelen, matchword, lmconf->enable_iwsp) == FALSE) { jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n"); ok_p = FALSE; } }#if 0 /* 腾菇陇を侯らない */ for (i=0;i<wchmm->winfo->num;i++) { if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) { jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n"); ok_p = FALSE; } }#endif jlog("STAT: %5d words ended (%6d nodes)\n",i,wchmm->n); if (! 
wchmm->hmminfo->multipath) { /* 票办不燎废误を积つ帽胳票晃の leaf node を2脚步して惰侍する */ num_duplicated = wchmm_duplicate_leafnode(wchmm); jlog("STAT: %d leaf nodes are made unshared\n", num_duplicated); /* 帽胳の姜眉から嘲への莲败澄唯を滇めておく */ wchmm_calc_wordend_arc(wchmm); } /* wchmmの腊圭拉をチェックする */ check_wchmm(wchmm); /* factoring脱に称觉轮に稿鲁帽胳のリストを烧裁する */ if (!wchmm->category_tree) {#ifdef UNIGRAM_FACTORING if (wchmm->lmtype == LM_PROB) { /* 票箕に涟もってfactoring猛を纷换 */ make_successor_list_unigram_factoring(wchmm); jlog("STAT: 1-gram factoring values has been pre-computed\n"); } else { make_successor_list(wchmm); }#else make_successor_list(wchmm);#endif /* UNIGRAM_FACTORING */ if (wchmm->hmminfo->multipath) { /* 菇蜜された factoring 攫鼠をスキップ莲败および矢片矢恕ノ〖ドにコピ〖 */ adjust_sc_index(wchmm); } #ifdef UNIGRAM_FACTORING if (wchmm->lmtype == LM_PROB) { /* 帽胳粗LMキャッシュが涩妥なノ〖ドのリストを侯る */ make_iwcache_index(wchmm); }#endif /* UNIGRAM_FACTORING */ /* sclist2node is no longer used */ if (wchmm->sclist2node != NULL) { free(wchmm->sclist2node); wchmm->sclist2node = NULL; } } jlog("STAT: done\n"); return ok_p;}/** * <JA> * 涂えられた帽胳辑今と咐胳モデルから腾菇陇步辑今を菇蜜する. * この簇眶は build_wchmm() と票じ借妄を乖いますが· * 呵介に帽胳を不燎误でソ〖トして不燎误の击た界に帽胳を事べるため· * より光庐に腾菇陇步を乖うことができる. とくにオプション回年をしない * 嘎り·Julius/Julianではこちらが脱いられる. * * @param wchmm [i/o] 腾菇陇步辑今 * @param lmconf [in] 咐胳モデル(LM)肋年パラメ〖タ * </JA> * <EN> * Build a tree lexicon from the given word dictionary and language model. * This function does the same job as build_wchmm(), but it is much * faster because the longest matching word for each word being added * is found by first sorting all the words in the dictionary by their phoneme * sequence order. This function will be used instead of build_wchmm() * by default. 
* * @param wchmm [i/o] lexicon tree * @param lmconf [in] language model (LM) configuration parameters * </EN> * @callgraph * @callergraph */ booleanbuild_wchmm2(WCHMM_INFO *wchmm, JCONF_LM *lmconf){ int i,j, last_i; int num_duplicated; WORD_ID *windex;#ifdef SEPARATE_BY_UNIGRAM LOGPROB separate_thres; LOGPROB p;#endif boolean ok_p; boolean ret; /* lingustic infos must be set before build_wchmm() is called */ /* check if necessary lingustic info is already assigned (for debug) */ if (wchmm->winfo == NULL || (wchmm->lmvar == LM_NGRAM && wchmm->ngram == NULL) || (wchmm->lmvar == LM_DFA_GRAMMAR && wchmm->dfa == NULL) ) { jlog("ERROR: wchmm: linguistic info not available!!\n"); return FALSE; } ok_p = TRUE; wchmm->separated_word_count = 0; jlog("STAT: Building HMM lexicon tree\n"); if (wchmm->lmtype == LM_PROB) {#ifdef SEPARATE_BY_UNIGRAM /* compute score threshold beforehand to separate words from tree */ /* here we will separate best [separate_wnum] words from tree */ separate_thres = get_nbest_uniprob(wchmm, lmconf->separate_wnum);#endif }#ifdef PASS1_IWCD#ifndef USE_OLD_IWCD if (wchmm->category_tree) { if (wchmm->ccd_flag) { /* when Julian mode (category-tree) and triphone is used, make all category-indexed context-dependent phone set (cdset) here */ /* these will be assigned on the last phone of each word on tree */ lcdset_register_with_category_all(wchmm); } }#endif#endif /* PASS1_IWCD */ /* initialize wchmm */ wchmm_init(wchmm); /* make sorted word index ordered by phone sequence */ windex = (WORD_ID *)mymalloc(sizeof(WORD_ID) * wchmm->winfo->num); for(i=0;i<wchmm->winfo->num;i++) windex[i] = i; if (wchmm->category_tree && wchmm->lmtype == LM_DFA) { /* sort by category -> sort by word ID in each category */ wchmm_sort_idx_by_category(wchmm->winfo, windex, wchmm->winfo->num); { int last_cate; last_i = 0; last_cate = wchmm->winfo->wton[windex[0]]; for(i = 1;i<wchmm->winfo->num;i++) { if (wchmm->winfo->wton[windex[i]] != last_cate) { 
wchmm_sort_idx_by_wseq(wchmm->winfo, windex, last_i, i - last_i); last_cate = wchmm->winfo->wton[windex[i]]; last_i = i; } } wchmm_sort_idx_by_wseq(wchmm->winfo, windex, last_i, wchmm->winfo->num - last_i); } } else { /* sort by word ID for whole vocabulary */ wchmm_sort_idx_by_wseq(wchmm->winfo, windex, 0, wchmm->winfo->num); }/* * { * int i,w; * for(i=0;i<wchmm->winfo->num;i++) { * w = windex[i]; * printf("%d: cate=%4d wid=%4d %s\n",i, wchmm->winfo->wton[w], w, wchmm->winfo->woutput[w]); * } * } */ /* incrementaly add words to lexicon tree */ /* now for each word, the previous word (last_i) is always the most matched one */ last_i = WORD_INVALID; for (j=0;j<wchmm->winfo->num;j++) { i = windex[j]; if (wchmm->lmtype == LM_PROB) { /* start/end silence word should not be shared */ if (i == wchmm->winfo->head_silwid || i == wchmm->winfo->tail_silwid) { /* add whole word as new (sharelen=0) */ if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) { jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n"); ok_p = FALSE; } continue; }#ifndef NO_SEPARATE_SHORT_WORD /* separate short words from tree */ if (wchmm->winfo->wlen[i] <= SHORT_WORD_LEN) { if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) { jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n"); ok_p = FALSE; } wchmm->separated_word_count++; continue; }#endif#ifdef SEPARATE_BY_UNIGRAM if (wchmm->ngram) { p = uni_prob(wchmm->ngram, wchmm->winfo->wton[i])#ifdef CLASS_NGRAM + wchmm->winfo->cprob[i]#endif ; } else { p = LOG_ZERO; } if (wchmm->lmvar == LM_NGRAM_USER) { p = (*(wchmm->uni_prob_user))(wchmm->winfo, i, p); } /* separate high-frequent words from tree (threshold = separate_thres) */ if (p >= separate_thres && wchmm->separated_word_count < lmconf->separate_wnum) { if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) { jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n"); ok_p = FALSE; } wchmm->separated_word_count++; continue; }#endif } if 
(last_i == WORD_INVALID) { /* first word */ ret = wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp); } else { /* the previous word (last_i) is always the most matched one */ if (wchmm->category_tree && wchmm->lmtype == LM_DFA) { if (wchmm->winfo->wton[i] != wchmm->winfo->wton[last_i]) { ret = wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp); } else { ret = wchmm_add_word(wchmm, i, wchmm_check_match(wchmm->winfo, i, last_i), last_i, lmconf->enable_iwsp); } } else { ret = wchmm_add_word(wchmm, i, wchmm_check_match(wchmm->winfo, i, last_i), last_i, lmconf->enable_iwsp); } } if (ret == FALSE) { jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n"); ok_p = FALSE; } last_i = i; } /* end of add word loop */ /*j_printerr("\r %5d words ended (%6d nodes)\n",j,wchmm->n);*/ /* free work area */ free(windex); if (wchmm->hmminfo->multipath) { jlog("STAT: lexicon size: %d nodes\n", wchmm->n); } else { /* duplicate leaf nodes of homophone/embedded words */ jlog("STAT: lexicon size: %d", wchmm->n); num_duplicated = wchmm_duplicate_leafnode(wchmm); jlog("+%d=%d\n", num_duplicated, wchmm->n); } if (! 
wchmm->hmminfo->multipath) { /* calculate transition probability of word end node to outside */ wchmm_calc_wordend_arc(wchmm); } /* check wchmm coherence (internal debug) */ check_wchmm(wchmm); /* make successor list for all branch nodes for N-gram factoring */ if (!wchmm->category_tree) {#ifdef UNIGRAM_FACTORING if (wchmm->lmtype == LM_PROB) { /* for 1-gram factoring, we can compute the values before search */ make_successor_list_unigram_factoring(wchmm); jlog("STAT: 1-gram factoring values has been pre-computed\n"); } else { make_successor_list(wchmm); }#else make_successor_list(wchmm);#endif /* UNIGRAM_FACTORING */ if (wchmm->hmminfo->multipath) { /* Copy the factoring data according to the skip transitions and startword nodes */ adjust_sc_index(wchmm); }#ifdef UNIGRAM_FACTORING if (wchmm->lmtype == LM_PROB) { /* make list of start nodes that needs inter-word LM cache */ make_iwcache_index(wchmm); }#endif /* UNIGRAM_FACTORING */ /* sclist2node is no longer used */ if (wchmm->sclist2node != NULL) { free(wchmm->sclist2node); wchmm->sclist2node = NULL; } } //jlog("STAT: done\n");#ifdef WCHMM_SIZE_CHECK if (debug2_flag) { /* detailed check of lexicon tree size (inaccurate!) 
*/ jlog("STAT: --- memory size of word lexicon ---\n"); jlog("STAT: wchmm: %d words, %d nodes\n", wchmm->winfo->num, wchmm->n); jlog("STAT: %9d bytes: wchmm->state[node] (exclude ac, sc)\n", sizeof(WCHMM_STATE) * wchmm->n); { int count1 = 0; int count2 = 0; int count3 = 0; for(i=0;i<wchmm->n;i++) { if (wchmm->self_a[i] != LOG_ZERO) count1++; if (wchmm->next_a[i] != LOG_ZERO) count2++; if (wchmm->ac[i] != NULL) count3++; } jlog("STAT: %9d bytes: wchmm->self_a[node] (%4.1f%% filled)\n", sizeof(LOGPROB) * wchmm->n, 100.0 * count1 / (float)wchmm->n); jlog("STAT: %9d bytes: wchmm->next_a[node] (%4.1f%% filled)\n", sizeof(LOGPROB) * wchmm->n, 100.0 * count2 / (float)wchmm->n); jlog("STAT: %9d bytes: wchmm->ac[node] (%4.1f%% used)\n", sizeof(A_CELL2 *) * wchmm->n, 100.0 * count3 / (float)wchmm->n); } jlog("STAT: %9d bytes: wchmm->stend[node]\n", sizeof(WORD_ID) * wchmm->n); { int w,count; count = 0; for(w=0;w<wchmm->winfo->num;w++) { count += wchmm->winfo->wlen[w] * sizeof(int) + sizeof(int *); } jlog("STAT: %9d bytes: wchmm->offset[w][]\n", count); } if (wchmm->hmminfo->multipath) { jlog("STAT: %9d bytes: wchmm->wordbegin[w]\n", wchmm->winfo->num * sizeof(int)); } jlog("STAT: %9d bytes: wchmm->wordend[w]\n", wchmm->winfo->num * sizeof(int)); jlog("STAT: %9d bytes: wchmm->startnode[]\n", wchmm->startnum * sizeof(int)); if (wchmm->category_tree) { jlog("STAT: %9d bytes: wchmm->start2wid[]\n", wchmm->startnum * sizeof(WORD_ID)); }#ifdef UNIGRAM_FACTORING if (wchmm->lmtype == LM_PROB) { jlog("STAT: %9d bytes: wchmm->start2isolate[]\n", wchmm->isolatenum * sizeof(int)); }#endif if (!wchmm->hmminfo->multipath) { jlog("STAT: %9d bytes: wchmm->wordend_a[]\n", wchmm->winfo->num * sizeof(LOGPROB)); }#ifdef PASS1_IWCD jlog("STAT: %9d bytes: wchmm->outstyle[]\n", wchmm->n * sizeof(unsigned char)); {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -