📄 wchmm.c
字号:
out_num_prev = out_num_next; } /* end of phone loop */ } /* end of multipath block */ } /* new phone node creation loop for this word */ /*************************************/ /* Short Pause appending (multipath) */ /*************************************/ /* if -iwsp, add noise model to the end of word at ntmp */ if (wchmm->hmminfo->multipath && enable_iwsp && add_tail - add_head + 1 > 0) { /* there are new phones to be created */ int ntmp_bak; /* set short pause state info */ ntmp_bak = ntmp; if (wchmm->hmminfo->sp->is_pseudo) { for(k = 1;k < hmm_logical_state_num(wchmm->hmminfo->sp) - 1; k++) { wchmm->outstyle[ntmp] = AS_LSET; wchmm->state[ntmp].out.lset = &(wchmm->hmminfo->sp->body.pseudo->stateset[k]); acc_init(wchmm, ntmp); wchmm->stend[ntmp] = WORD_INVALID; ntmp++; if (ntmp >= wchmm->maxwcn) wchmm_extend(wchmm); } } else { for(k = 1;k < hmm_logical_state_num(wchmm->hmminfo->sp) - 1; k++) { wchmm->outstyle[ntmp] = AS_STATE; wchmm->state[ntmp].out.state = wchmm->hmminfo->sp->body.defined->s[k]; acc_init(wchmm, ntmp); wchmm->stend[ntmp] = WORD_INVALID; ntmp++; if (ntmp >= wchmm->maxwcn) wchmm_extend(wchmm); } } ntmp = ntmp_bak; /* connect incoming arcs from previous phone */ out_num_next = 0; for (ato = 1; ato < hmm_logical_state_num(wchmm->hmminfo->sp); ato++) { prob = hmm_logical_trans(wchmm->hmminfo->sp)->a[0][ato]; if (prob != LOG_ZERO) { /* to control short pause insertion, transition probability toward the word-end short pause will be given a penalty */ prob += wchmm->hmminfo->iwsp_penalty; if (ato == hmm_logical_state_num(wchmm->hmminfo->sp) - 1) { /* model has a model skip transition, just inherit them to next */ for(kkk=0; kkk<out_num_prev; kkk++) { out_from_next[out_num_next] = out_from[kkk]; out_a_next[out_num_next] = out_a[kkk] + prob; out_num_next++; } } else { /* connect incoming arcs from previous phone to this phone */ for(kkk=0; kkk<out_num_prev; kkk++) { add_wacc(wchmm, out_from[kkk], out_a[kkk] + prob, ntmp + ato - 1); } } } } /* if short pause model doesn't have a model skip transition, also add it */ if (hmm_logical_trans(wchmm->hmminfo->sp)->a[0][hmm_logical_state_num(wchmm->hmminfo->sp)-1] == LOG_ZERO) { /* to make insertion sp model to have no effect on the original path, the skip transition probability should be 0.0 (=100%) */ prob = 0.0; for(kkk=0; kkk<out_num_prev; kkk++) { out_from_next[out_num_next] = out_from[kkk]; out_a_next[out_num_next] = out_a[kkk] + prob; out_num_next++; } } /* connect arcs within model, and store new outgoing arcs for wordend node */ for (k = 1; k < hmm_logical_state_num(wchmm->hmminfo->sp) - 1; k++) { for (ato = 1; ato < hmm_logical_state_num(wchmm->hmminfo->sp); ato++) { prob = hmm_logical_trans(wchmm->hmminfo->sp)->a[k][ato]; if (prob != LOG_ZERO) { if (ato == hmm_logical_state_num(wchmm->hmminfo->sp) - 1) { out_from_next[out_num_next] = ntmp; out_a_next[out_num_next] = prob; out_num_next++; } else { add_wacc(wchmm, ntmp, prob, ntmp + ato - k); } } } ntmp++; } /* swap work area for next */ for(kkk=0;kkk<out_num_next;kkk++) { out_from[kkk] = out_from_next[kkk]; out_a[kkk] = out_a_next[kkk]; } out_num_prev = out_num_next; } /* end of inter-word short pause appending block */ /* make mapping: word <-> node on wchmm */ for (j=0;j<word_len;j++) { if (j < add_head) { /* shared part */ wchmm->offset[word][j] = wchmm->offset[matchword][j]; } else if (add_tail < j) { /* shared tail part (should not happen..) */ wchmm->offset[word][j] = wchmm->offset[matchword][j+(matchword_len-word_len)]; } else { /* newly created part */ wchmm->offset[word][j] = n; n += hmm_logical_state_num(wchmm->winfo->wseq[word][j]) - 2; } } if (wchmm->hmminfo->multipath) { /* create word-end node */ /* paranoia check if the short-pause addition has been done well */ if (enable_iwsp && add_tail - add_head + 1 > 0) { n += hmm_logical_state_num(wchmm->hmminfo->sp) - 2; if (n != ntmp) j_internal_error("wchmm_add_word: cannot match\n"); } /* create word-end node */ wchmm->wordend[word] = n; /* tail node of 'word' is 'n' */ wchmm->stend[n] = word; /* node 'k' is a tail node of 'word' */ acc_init(wchmm, n); wchmm->state[n].out.state = NULL; /* connect the final outgoing arcs in out_from[] to the word end node */ for(k = 0; k < out_num_prev; k++) { add_wacc(wchmm, out_from[k], out_a[k], n); } n++; if (n >= wchmm->maxwcn) wchmm_extend(wchmm); if (matchlen == 0) { /* check if the new word has whole word-skipping transition */ /* (use out_from and out_num_prev temporary) */ out_num_prev = 0; get_outtrans_list(wchmm, word, word_len-1, out_from, out_a, &out_num_prev, wchmm->winfo->maxwn, enable_iwsp); for(k=0;k<out_num_prev;k++) { if (out_from[k] == wchmm->wordbegin[word]) { jlog("ERROR: *** ERROR: WORD SKIPPING TRANSITION NOT ALLOWED ***\n"); jlog("ERROR: Word id=%d (%s[%s]) has \"word skipping transition\".\n", word, wchmm->winfo->wname[word], wchmm->winfo->woutput[word]); jlog("ERROR: All HMMs in the word:\n "); for(kkk=0;kkk<wchmm->winfo->wlen[word];kkk++) { jlog("%s ", wchmm->winfo->wseq[word][kkk]->name); } jlog("\n"); jlog("ERROR: has transitions from initial state to final state.\n"); jlog("ERROR: This type of word skipping is not supported.\n"); ok_p = FALSE; } } } wchmm->n = n; } else { wchmm->n = n; k = wchmm->offset[word][word_len-1] + hmm_logical_state_num(wchmm->winfo->wseq[word][word_len-1])-2 -1; wchmm->wordend[word] = k; /* tail node of 'word' is 'k' */ wchmm->stend[k] = word; /* node 'k' is a tail node of 'word' */ if (matchlen != 0 && add_tail - add_head + 1 > 0) { /* new part has been created in the above procedure: */ /* now make link from shared part to the new part */ wchmm_link_subword(wchmm, matchword,add_to,word,add_head); } } return(ok_p); }/*************************************************************//**** parse whole structure (after wchmm has been built) *****//*************************************************************//** * <JA> * 腾菇陇步辑今を瘤汉し·帽胳の姜眉觉轮から嘲への肌莲败澄唯のリストを侯喇する. * (non multipath) * * @param wchmm [i/o] 腾菇陇步辑今 * </JA> * <EN> * Scan the lexicon tree to make list of emission probability from the word end * state. (non multipath) * * @param wchmm [i/o] tree lexicon * </EN> */static voidwchmm_calc_wordend_arc(WCHMM_INFO *wchmm){ WORD_ID w; HTK_HMM_Trans *tr; LOGPROB a; for (w=0;w<wchmm->winfo->num;w++) { tr = hmm_logical_trans(wchmm->winfo->wseq[w][wchmm->winfo->wlen[w]-1]); a = tr->a[tr->statenum-2][tr->statenum-1]; wchmm->wordend_a[w] = a; }}#ifdef SEPARATE_BY_UNIGRAM/********************************************************************//****** for separation (linearization) of high-frequent words *******//********************************************************************//** * <JA> * unigram澄唯でソ〖トするための qsort コ〖ルバック簇眶. * * @param a [in] 妥燎1 * @param b [in] 妥燎2 * * @return 遍换の冯蔡の射圭を手す. * </JA> * <EN> * qsort callback function to sort unigram values. * * @param a [in] element #1 * @param b [in] element #2 * * @return the result of comparison. * </EN> */static intcompare_prob(LOGPROB *a, LOGPROB *b){ if (*a < *b) return (1); if (*a > *b) return (-1); return(0);}/** * <JA> * 1-gramスコアの惧疤 N 戎誊の猛を滇める. * * @param winfo [in] 帽胳辑今 * @param n [in] 滇める界疤 * * @return 惧疤 N 戎誊の uni-gram 澄唯の猛を手す. * </JA> * <EN> * Get the Nth-best unigram probability from all words. * * @param winfo [in] word dictionary * @param n [in] required rank * * @return the Nth-best unigram probability. * </EN> */static LOGPROBget_nbest_uniprob(WCHMM_INFO *wchmm, int n){ LOGPROB *u_p; WORD_ID w; LOGPROB x; WORD_INFO *winfo; NGRAM_INFO *ngram; winfo = wchmm->winfo; ngram = wchmm->ngram; if (n < 1) n = 1; if (n > winfo->num) n = winfo->num; /* store all unigram probability to u_p[] */ u_p = (LOGPROB *)mymalloc(sizeof(LOGPROB) * winfo->num); for(w=0;w<winfo->num;w++) { if (ngram) { x = uni_prob(ngram, winfo->wton[w])#ifdef CLASS_NGRAM + winfo->cprob[w]#endif ; } else { x = LOG_ZERO; } if (wchmm->lmvar == LM_NGRAM_USER) { x = (*(wchmm->uni_prob_user))(wchmm->winfo, w, x); } u_p[w] = x; } /* sort them downward */ qsort(u_p, winfo->num, sizeof(LOGPROB), (int (*)(const void *,const void *))compare_prob); /* return the Nth value */ x = u_p[n-1]; free(u_p); return(x);}#endif/**********************************************************//****** MAKE WCHMM (LEXICON TREE) --- main function *******//**********************************************************/#define COUNT_STEP 500 ///< Word count step for debug progress output/** * <JA> * 涂えられた帽胳辑今と咐胳モデルから腾菇陇步辑今を菇蜜する. この簇眶は * 借妄が觅く·Julianで"-oldtree"オプション回年箕のみ蝗脱されます. オプション * 润回年箕およびJuliusでは洛わりに build_wchmm2() が脱いられます. * * @param wchmm [i/o] 腾菇陇步辑今 * @param lmconf [in] 咐胳モデル(LM)肋年パラメ〖タ * </JA> * <EN> * Build a tree lexicon from given word dictionary and language model. * This function is slow and only used when "-oldtree" option is specified * in Julian. Julian without that option and Julius uses build_wchmm2() * instead of this. * * @param wchmm [i/o] lexicon tree * @param lmconf [in] language model (LM) configuration parameters * </EN> * @callgraph * @callergraph */booleanbuild_wchmm(WCHMM_INFO *wchmm, JCONF_LM *lmconf){ int i,j; int matchword=0, sharelen=0, maxsharelen=0; int num_duplicated;#ifdef SEPARATE_BY_UNIGRAM LOGPROB separate_thres; LOGPROB p;#endif boolean ok_p; /* lingustic infos must be set before build_wchmm() is called */ /* check if necessary lingustic info is already assigned (for debug) */ if (wchmm->winfo == NULL || (wchmm->lmvar == LM_NGRAM && wchmm->ngram == NULL) || (wchmm->lmvar == LM_DFA_GRAMMAR && wchmm->dfa == NULL) ) { jlog("ERROR: wchmm: linguistic info not available!!\n"); return FALSE; } ok_p = TRUE; #ifdef SEPARATE_BY_UNIGRAM /* 惧疤[separate_wnum]戎誊の1-gramスコアを滇める */ /* 1-gramスコアがこの猛笆惧のものは腾から尸ける */ separate_thres = get_nbest_uniprob(wchmm, lmconf->separate_wnum);#endif#ifdef PASS1_IWCD#ifndef USE_OLD_IWCD if (wchmm->category_tree) { if (wchmm->ccd_flag) { /* 链てのカテゴリID烧き lcd_set を侯喇 */ lcdset_register_with_category_all(wchmm); } }#endif#endif /* PASS1_IWCD */ /* wchmmを介袋步 */ wchmm_init(wchmm); /* カウンタリセット */ wchmm->separated_word_count=0; jlog("STAT: wchmm: Building HMM lexicon tree (left-to-right)\n"); for (i=0;i<wchmm->winfo->num;i++) { if (wchmm->lmtype == LM_PROB) { if (i == wchmm->winfo->head_silwid || i == wchmm->winfo->tail_silwid) { /* 黎片/琐萨の痰不モデルは腾菇陇步せず· * 黎片の痰不帽胳の黎片への莲败·琐萨帽胳の琐萨からの莲败は侯らない*/ /* sharelen=0でそのまま */ if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) { jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n"); ok_p = FALSE; } continue; }#ifndef NO_SEPARATE_SHORT_WORD if (wchmm->winfo->wlen[i] <= SHORT_WORD_LEN) { /* 墓さの没い帽胳を腾菇陇步しない(ここでは1不泪) */ /* sharelen=0でそのまま */ if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) { jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n"); ok_p = FALSE; } wchmm->separated_word_count++; continue; }#endif#ifdef SEPARATE_BY_UNIGRAM if (wchmm->ngram) { p = uni_prob(wchmm->ngram, wchmm->winfo->wton[i])#ifdef CLASS_NGRAM + wchmm->winfo->cprob[i]#endif ;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -