📄 factoring_sub.c
字号:
* On 1-gram factoring, the shared nodes on branch has fixed factoring score * from 1-gram values, independent of the word context on recognition. So * the values are fixed for all recognition and can be calculated before * search. This function stores all the neede 1-gram factoring value by * traversing tree lexicon with successor lists and compute maximum 1-gram * for each successor lists that has more than two words (=shared). * Since a successor list is no more neede after the 1-gram value is computed, * they will be freed. * * Actually, computed factoring scores will be stored in wchmm->fscore * sequencially, and the index value, starting from 1, * to the fscore list is stored in scid of each nodes as a negative value. * The free will be performed in compaction_successor() by checking if a * successor's corresponding scid on tree lexicon has negative value. * * @param wchmm [i/o] tree lexicon * </EN> * * @callgraph * @callergraph * */voidcalc_all_unigram_factoring_values(WCHMM_INFO *wchmm){ S_CELL *sc, *sctmp; LOGPROB tmpprob, maxprob; int i, n; /* count needed number of 1-gram factoring nodes */ n = 0; for (i=1;i<wchmm->scnum;i++) { sc = wchmm->sclist[i]; if (sc == NULL) { j_internal_error("call_all_unigram_factoring_values: sclist has no sc?\n"); } if (sc->next != NULL) { /* more than two words, so compute maximum 1-gram probability */ n++; } } wchmm->fsnum = n + 1; /* allocate area */ wchmm->fscore = (LOGPROB *)mymalloc(sizeof(LOGPROB) * wchmm->fsnum); /* assign values */ n = 1; for (i=1;i<wchmm->scnum;i++) { sc = wchmm->sclist[i]; if (sc->next != NULL) { maxprob = LOG_ZERO; for (sctmp = sc; sctmp; sctmp = sctmp->next) { if (wchmm->ngram) { tmpprob = uni_prob(wchmm->ngram, wchmm->winfo->wton[sctmp->word])#ifdef CLASS_NGRAM + wchmm->winfo->cprob[sctmp->word] #endif ; } else { tmpprob = LOG_ZERO; } if (wchmm->lmvar == LM_NGRAM_USER) { tmpprob = (*(wchmm->uni_prob_user))(wchmm->winfo, sctmp->word, tmpprob); } if (maxprob < tmpprob) maxprob = tmpprob; } wchmm->fscore[n] = maxprob; free_successor(wchmm, i); wchmm->state[wchmm->sclist2node[i]].scid = - n; n++; } } /* garbage collection of factored sclist */ compaction_successor(wchmm);}#else /* ~UNIGRAM_FACTORING *//** * <JA> * 腾菇陇步辑今惧のあるノ〖ドについて·涂えられた帽胳旺悟に滦する2-gram * スコアを纷换する. * * @param wchmm [in] 腾菇陇步辑今 * @param lastword [in] 木涟帽胳 * @param node [in] @a wchmm 惧のノ〖ド戎规 * * @return 2-gram 澄唯. * </JA> * <EN> * Compute 2-gram factoring value for the node and return the probability. * * @param wchmm [in] tree lexicon * @param lastword [in] the last context word * @param node [in] node ID on @a wchmm * * @return the log probability of 2-gram on that node. * </EN> * */static LOGPROBcalc_successor_prob(WCHMM_INFO *wchmm, WORD_ID lastword, int node){ S_CELL *sc; LOGPROB tmpprob, maxprob; WORD_ID lw; maxprob = LOG_ZERO; if (wchmm->ngram) { lw = wchmm->winfo->wton[lastword]; } for (sc = wchmm->sclist[wchmm->state[node].scid]; sc; sc = sc->next) { if (wchmm->ngram) { tmpprob = (*(wchmm->ngram->bigram_prob))(wchmm->ngram, lw , wchmm->winfo->wton[sc->word])#ifdef CLASS_NGRAM + wchmm->winfo->cprob[sc->word]#endif ; } else { tmpprob = LOG_ZERO; } if (wchmm->lmvar == LM_NGRAM_USER) { tmpprob = (*(wchmm->bi_prob_user))(wchmm->winfo, lastword, sc->word, tmpprob); } if (maxprob < tmpprob) maxprob = tmpprob; } return(maxprob);}#endif /* ~UNIGRAM_FACTORING *//** * <JA> * @brief 帽胳柒のあるノ〖ドについて factoring 猛を纷换する. * * 1-gram factoring で盖年factoring猛がある眷圭はその猛が篓郝に手される. * 戮の眷圭は·そのノ〖ドのサブツリ〖柒の帽胳の 2-gram澄唯∈の呵络猛∷が * 纷换される. * * 帽胳柒 factoring キャッシュが雇胃される. すなわち称ノ〖ドについて * 木涟帽胳が涟搀アクセスされたときと票じであれば· * 涟搀の猛が手され·そうでなければ猛を纷换し·キャッシュが构糠される. * * @param wchmm [in] 腾菇陇步辑今 * @param lastword [in] 木涟帽胳のID * @param node [in] ノ〖ド戎规 * * @return 咐胳モデルスコア * </JA> * <EN> * @brief compute factoring LM score for the given word-internal node. * * If it is a shared branch node and 1-gram factoring is used, the * constant factoring value which has already been assigned before search * will be returned immediately. Else, the maximum 2-gram probability * of corresponding successor words are computed. * * The word-internal factoring cache is consulted within this function. * If the given last word is the same as the last call on that node, * the last computed value will be returned, else the maximum value * will be computed update the cache with the last word and value. * * @param wchmm [in] tree lexicon * @param lastword [in] word ID of last context word * @param node [in] node ID * * @return the LM factoring score. * </EN> * * @callgraph * @callergraph * */LOGPROBmax_successor_prob(WCHMM_INFO *wchmm, WORD_ID lastword, int node){ LOGPROB maxprob; WORD_ID last_nword, w; int scid; LM_PROB_CACHE *l; l = &(wchmm->lmcache); if (lastword != WORD_INVALID) { /* return nothing if no previous word */ if (wchmm->ngram) { last_nword = wchmm->winfo->wton[lastword]; } else { last_nword = lastword; } scid = wchmm->state[node].scid;#ifdef UNIGRAM_FACTORING if (scid < 0) { /* return 1-gram factoring value already calced */ return(wchmm->fscore[(- scid)]); } else { /* this node has only one successor */ /* return precise 2-gram score */ if (last_nword != l->lastwcache[scid]) { /* calc and cache */ w = (wchmm->sclist[scid])->word; if (wchmm->ngram) { maxprob = (*(wchmm->ngram->bigram_prob))(wchmm->ngram, last_nword, wchmm->winfo->wton[w])#ifdef CLASS_NGRAM + wchmm->winfo->cprob[w]#endif ; } else { maxprob = LOG_ZERO; } if (wchmm->lmvar == LM_NGRAM_USER) { maxprob = (*(wchmm->bi_prob_user))(wchmm->winfo, lastword, w, maxprob); } l->lastwcache[scid] = last_nword; l->probcache[scid] = maxprob; return(maxprob); } else { /* return cached */ return (l->probcache[scid]); } }#else /* UNIGRAM_FACTORING */ /* 2-gram */ if (last_nword != l->lastwcache[scid]) { maxprob = calc_successor_prob(wchmm, lastword, node); /* store to cache */ l->lastwcache[scid] = last_nword; l->probcache[scid] = maxprob; return(maxprob); } else { return (l->probcache[scid]); }#endif /* UNIGRAM_FACTORING */ } else { return(0.0);#if 0 maxprob = LOG_ZERO; for (sc=wchmm->state[node].sc;sc;sc=sc->next) { tmpprob = uni_prob(wchmm->ngram, sc->word); if (maxprob < tmpprob) maxprob = tmpprob; } return(maxprob);#endif }}/** * <JA> * @brief 帽胳粗の factoring 猛のリストを手す. * * 涂えられた木涟帽胳に滦して·factoring猛を纷换すべき链ての帽胳黎片への * factoring 猛を纷换し·そのリストを手す. このfactoring猛は * 木涟帽胳ごとにリスト帽疤でキャッシュされる. すなわち·その木涟帽胳が * それまでに办刨でも木涟帽胳として叫附していた眷圭·そのリストをそのまま * 手す. * * @param wchmm [in] 腾菇陇步辑今 * @param lastword [in] 木涟帽胳 * * @return 链帽胳黎片ノ〖ドへの factoring スコアのリスト * </JA> * <EN> * @brief Compute cross-word facgtoring values for word head nodes and return * the list. * * Given a last word, this function compute the factoring LM scores for all * the word head node to which the context-dependent (not 1-gram) factoring * values should be computed. The resulting list of factoring values are * cached within this function per the last word. * * @param wchmm [in] tree lexicon * @param lastword [in] last word * * @return the list of factoring LM scores for all the needed word-head nodes. * </EN> * * @callgraph * @callergraph * */LOGPROB *max_successor_prob_iw(WCHMM_INFO *wchmm, WORD_ID lastword){ int i, j, x, node; int last_nword; WORD_ID w; LM_PROB_CACHE *l; LOGPROB p; l = &(wchmm->lmcache); if (wchmm->ngram) { last_nword = wchmm->winfo->wton[lastword]; } else { last_nword = lastword; }#ifdef HASH_CACHE_IW x = last_nword % l->iw_cache_num; if (l->iw_lw_cache[x] == last_nword) { /* cache hit */ return(l->iw_sc_cache[x]); }#else /* full cache */ if (l->iw_sc_cache[last_nword] != NULL) { /* cache hit */ return(l->iw_sc_cache[last_nword]); } x = last_nword; /* cache mis-hit, calc probs and cache them as new */#endif /* allocate cache memory */ if (l->iw_sc_cache[x] == NULL) {#ifdef UNIGRAM_FACTORING l->iw_sc_cache[x] = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->isolatenum);#else l->iw_sc_cache[x] = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->startnum);#endif if (l->iw_sc_cache[x] == NULL) { /* malloc failed */ /* clear existing cache, and retry */ max_successor_prob_iw_free(wchmm); jlog("STAT: inter-word LM cache (%dMB) rehashed\n", (l->iw_cache_num * #ifdef UNIGRAM_FACTORING wchmm->isolatenum#else wchmm->startnum#endif ) / 1000 * sizeof(LOGPROB) / 1000);#ifdef UNIGRAM_FACTORING l->iw_sc_cache[x] = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->isolatenum);#else l->iw_sc_cache[x] = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->startnum);#endif if (l->iw_sc_cache[x] == NULL) { /* malloc failed again? */ j_internal_error("max_successor_prob_iw: cannot malloc\n"); } } } /* calc prob for all startid */#ifdef UNIGRAM_FACTORING for (j=0;j<wchmm->startnum;j++) { i = wchmm->start2isolate[j]; if (i == -1) continue; node = wchmm->startnode[j]; if (wchmm->state[node].scid <= 0) { /* should not happen!!! below is just for debugging */ j_internal_error("max_successor_prob_iw: isolated (not shared) tree root node has unigram factoring value??\n"); } else { w = (wchmm->sclist[wchmm->state[node].scid])->word; if (wchmm->ngram) { p = (*(wchmm->ngram->bigram_prob))(wchmm->ngram, last_nword, wchmm->winfo->wton[w])#ifdef CLASS_NGRAM + wchmm->winfo->cprob[w]#endif ; } else { p = LOG_ZERO; } if (wchmm->lmvar == LM_NGRAM_USER) { p = (*(wchmm->bi_prob_user))(wchmm->winfo, lastword, w, p); } l->iw_sc_cache[x][i] = p; } }#else /* ~UNIGRAM_FACTORING */ for (i=0;i<wchmm->startnum;i++) { node = wchmm->startnode[i]; l->iw_sc_cache[x][i] = calc_successor_prob(wchmm, lastword, node); }#endif#ifdef HASH_CACHE_IW l->iw_lw_cache[x] = last_nword;#endif return(l->iw_sc_cache[x]);}/** * <JA> * @brief 矢恕による帽胳柒疯年弄 factoring * * Julian において CATEGORY_TREE が年盗されているとき∈デフォルト∷· * 腾菇陇步辑今はカテゴリ帽疤∈すなわち菇矢扩腆の淡揭帽疤∷で菇蜜されるため· * 妈1パスでの咐胳モデルであるカテゴリ滦扩腆は帽胳の幌姜眉で努脱できる. * * この CATEGORY_TREE が年盗されていない眷圭·腾菇陇步辑今は * 辑今链挛で帽办の腾が侯られるため·カテゴリ滦扩腆は N-gram (Julius) と * 票屯に帽胳柒で factoring と票屯の怠菇で努脱される涩妥がある. * * この簇眶は CATEGORY_TREE が年盗されていないときに·惧淡の factoring * ∈疯年弄 factoring と钙ばれる∷を乖なうために捏丁されている. * * @param wchmm [in] 腾菇陇步辑今 * @param lastword [in] 木涟帽胳 * @param node [in] ノ〖ド戎规 * * @return カテゴリ扩腆惧その晦への莲败が钓されれば TRUE, 稍材墙であれば FALSE * </JA> * <EN> * @brief Deterministic factoring for grammar-based recognition (Julian) * * If CATEGORY_TREE is defined (this is default) on Julian, the tree lexicon * will be organized per category and the category-pair constraint used * in the 1st pass can be applied statically at cross-word transition. * * If the CATEGORY_TREE is not defined, a single tree lexicon will be * constucted for a whole dictionary. In this case, the category-pair * constraint should be applied dynamically in the word-internal transition, * like the factoring scheme with N-gram (Julius). * * This function provides such word-internal factoring for grammar-based * recognition (called deterministic factoring) when CATEGORY_TREE is * undefined in Julian. * * @param wchmm [in] tree lexicon * @param lastword [in] last word * @param node [in] node ID to check the constraint * * @return TRUE if the transition to the branch is allowed on the category-pair * constraint, or FALSE if not allowed. * </EN> * * @callgraph * @callergraph * */booleancan_succeed(WCHMM_INFO *wchmm, WORD_ID lastword, int node){ int lc; S_CELL *sc; /* return TRUE if at least one subtree word can connect */ if (lastword == WORD_INVALID) { /* case at beginning-of-word */ for (sc=wchmm->sclist[wchmm->state[node].scid];sc;sc=sc->next) { if (dfa_cp_begin(wchmm->dfa, sc->word) == TRUE) return(TRUE); } return(FALSE); } else { lc = wchmm->winfo->wton[lastword]; for (sc=wchmm->sclist[wchmm->state[node].scid];sc;sc=sc->next) { if (dfa_cp(wchmm->dfa, lc, sc->word) == TRUE) return(TRUE); } return(FALSE); }}/* end of file */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -