📄 recog.h
字号:
/** * @file recog.h * * <JA> * @brief エンジンインスタンスの年盗 * * 千急エンジンのインスタンス年盗を乖いますˉインスタンスは· * Recog をトップインスタンスとして·蝗脱する不读モデル·咐胳モデル· * それらを寥み圭わせた千急借妄インスタンスを剩眶积ちますˉ * * 称婶のインスタンスは·滦炳する jconf 柒の肋年菇陇挛·および * 蝗脱するサブインスタンスへのポインタを积ちますˉPROCESS_AM は不读モデル· * PROCESS_LM は咐胳モデルごとに年盗されますˉ * * MFCCCalc は· * 不读モデルおよび GMM で妥滇されるパラメ〖タタイプを拇べたのち· * それらを栏喇するのに涩妥なだけ栏喇されますˉ票办のMFCC房および * その戮のフロントエンド借妄掘凤を积つ不读モデルおよびGMMどうしでは * 票じ MFCCCalc が鼎铜されますˉ * * </JA> * * <EN> * @brief Enging instance definitions * * This file defines the engine instance and all its sub instances. * The top instance is Recog, and it consists of several * sub instances for LM, AM, and recognition process instances. * * Each sub-instance keeps pointer to corresponding jconf setting * part, and also has pointers to other instances to use. * PROCESS_AM will be generated for each acoustic model, and PROCESS_LM * will be for each language model. * * MFCCCalc will be generated for each required MFCC frontend types * by inspecting all AMs and GMM. The AM's and GMMs that requires * exactly the same MFCC frontend will share the same MFCC frontend. * * </EN> * * <pre> * Recog * +- *JCONF * +- input related work area * +- MFCCCalc[] (linked list) (generated from HMM + GMM) * +- PROCESS_AM[] (linked list) * +- *pointer to JCONF_AM * +- *pointer to MFCCCalc * +- hmminfo, hmm_gs * +- hmmwrk * +- multipath, ccd_flag, cmn_loaded * +- PROCESS_LM[] (linked list) * +- *pointer to JCONF_LM * +- *pointer to PROCESS_AM * +- lmtype, lmvar * +- winfo * +- ngram or grammars * +- lmfunc * +- RecogProcess process[] (linked list) * +- *pointer to JCONF_SEARCH * +- *pointer to PROCESS_AM * +- *pointer to PROCESS_LM * +- lmtype, lmvar * +- misc. param * +- GMMCalc * +- *JCONF_AM for GMM * +- *pointer to MFCCCalc * </pre> * * @author Akinobu Lee * @date Fri Feb 16 13:42:28 2007 * * $Revision: 1.7 $ * *//* * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology * All rights reserved *//**/#ifndef __J_RECOG_H__#define __J_RECOG_H__#include <sent/stddefs.h>#include <sent/hmm.h>#include <sent/vocabulary.h>#include <sent/ngram2.h>#include <sent/dfa.h>#include <julius/wchmm.h>#include <julius/search.h>#include <julius/callback.h>#include <julius/jconf.h>/* How tokens are managed: o tlist[][] is a token stocker. It holds all tokens in sequencial buffer. They are malloced first on startup, and refered by ID while Viterbi procedure. In word-pair mode, each token also has a link to another token to allow a node to have more than 1 token. o token[n] holds the current ID number of a token associated to a lexicon tree node 'n'. *//** * Work area for the first pass * */typedef struct __FSBeam__ { /* token stocker */ TOKEN2 *tlist[2]; ///< Token space to hold all token entities. TOKENID *tindex[2]; ///< Token index corresponding to @a tlist for sort int maxtnum; ///< Allocated number of tokens (will grow) int expand_step; ///< Number of tokens to be increased per expansion boolean expanded; ///< TRUE if the tlist[] and tindex[] has been expanded at last create_token(); int tnum[2]; ///< Current number of tokens used in @a tlist int n_start; ///< Start index of in-beam nodes on @a tindex int n_end; ///< end index of in-beam nodes on @a tindex int tl; ///< Current work area id (0 or 1, swapped for each frame) int tn; ///< Next work area id (0 or 1, swapped for each frame) /* Active token list */ TOKENID *token; ///< Active token list that holds currently assigned tokens for each tree node#ifdef UNIGRAM_FACTORING /* for wordend processing with 1-gram factoring */ LOGPROB wordend_best_score; ///< Best score of word-end nodes int wordend_best_node; ///< Node id of the best wordend nodes TRELLIS_ATOM *wordend_best_tre; ///< Trellis word corresponds to above WORD_ID wordend_best_last_cword; ///< Last context-aware word of above#endif int totalnodenum; ///< Allocated number of nodes in @a token TRELLIS_ATOM bos; ///< Special token for beginning-of-sentence boolean nodes_malloced; ///< Flag to check if tokens already allocated LOGPROB lm_weight; ///< Language score weight (local copy) LOGPROB lm_penalty; ///< Word insertion penalty (local copy) LOGPROB lm_penalty_trans; ///< Additional insertion penalty for transparent words (local copy) LOGPROB penalty1; ///< Word insertion penalty for DFA (local copy)#if defined(WPAIR) && defined(WPAIR_KEEP_NLIMIT) boolean wpair_keep_nlimit; ///< Keeps only N token on word-pair approx. (local copy from jconf)#endif /* for short-pause segmentation */ boolean in_sparea; ///< TRUE when we are in a pause area now int tmp_sparea_start; ///< Memorize where the current pause area begins#ifdef SP_BREAK_RESUME_WORD_BEGIN WORD_ID tmp_sp_break_last_word; ///< Keep the max word hypothesis at beginning of this segment as the starting word of next segment#else WORD_ID last_tre_word; ///< Keep ths max word hypothesis at the end of this segment for as the starting word of the next segment#endif boolean first_sparea; ///< TRUE when we are in the first pause area int sp_duration; ///< Number of current successive sp frame#ifdef SPSEGMENT_NAIST boolean after_trigger; ///< TRUE if speech already triggered int trigger_duration; ///< Current speech duration at uptrigger detection boolean want_rewind; ///< TRUE if process wants mfcc rewinding int rewind_frame; ///< Place to rewind to boolean want_rewind_reprocess; ///< TRUE if requires re-processing after rewind#endif char *pausemodelnames; ///< pause model name string to detect segment char **pausemodel; ///< each pause model name to detect segment int pausemodelnum; ///< num of pausemodel} FSBeam;/** * Work area for realtime processing of 1st pass * */typedef struct __RealBeam__ { /* input parameter */ int maxframelen; ///< Maximum allowed input frame length SP16 *window; ///< Window buffer for MFCC calculation int windowlen; ///< Buffer length of @a window int windownum; ///< Currently left samples in @a window /* for short-pause segmentation */ boolean last_is_segmented; ///< TRUE if last pass was a segmented input SP16 *rest_Speech; ///< Speech samples left unprocessed by segmentation at previous segment int rest_alloc_len; ///< Allocated length of rest_Speech int rest_len; ///< Current stored length of rest_Speech} RealBeam;/** * Work area for the 2nd pass * */typedef struct __StackDecode__ { int hypo_len_count[MAXSEQNUM+1]; ///< Count of popped hypothesis per each length int maximum_filled_length; ///< Current least beam-filled depth#ifdef SCAN_BEAM LOGPROB *framemaxscore; ///< Maximum score of each frame on 2nd pass for score enveloping#endif NODE *stocker_root; ///< Node stocker for recycle int popctr; ///< Num of popped hypotheses from stack int genectr; ///< Num of generated hypotheses int pushctr; ///< Num of hypotheses actually pushed to stack int finishnum; ///< Num of found sentence hypothesis NODE *current; ///< Current node for debug#ifdef CONFIDENCE_MEASURE LOGPROB cm_alpha; ///< alpha scaling value from jconf# ifdef CM_MULTIPLE_ALPHA LOGPROB *cmsumlist; ///< Sum of cm score for each alpha coef. int cmsumlistlen; ///< Allocated length of cmsumlist.# endif# ifdef CM_SEARCH LOGPROB cm_tmpbestscore; ///< Temporal best score for summing up scores# ifndef CM_MULTIPLE_ALPHA LOGPROB cm_tmpsum; ///< Sum of CM score# endif int l_stacksize; ///< Local stack size for CM int l_stacknum; ///< Num of hypo. in local stack for CM NODE *l_start; ///< Top node of local stack for CM NODE *l_bottom; ///< bottom node of local stack for CM# endif# ifdef CM_NBEST LOGPROB *sentcm = NULL; ///< Confidence score of each sentence LOGPROB *wordcm = NULL; ///< Confidence score of each word voted from @a sentcm int sentnum; ///< Allocated length of @a sentcm# endif#endif /* CONFIDENCE_MEASURE */ LOGPROB *wordtrellis[2]; ///< Buffer to compute viterbi path of a word LOGPROB *g; ///< Buffer to hold source viterbi scores HMM_Logical **phmmseq; ///< Phoneme sequence to be computed int phmmlen_max; ///< Maximum length of @a phmmseq. boolean *has_sp; ///< Mark which phoneme allow short pause for multi-path mode#ifdef GRAPHOUT_PRECISE_BOUNDARY short *wend_token_frame[2]; ///< Propagating token of word-end frame to detect corresponding end-of-words at word head LOGPROB *wend_token_gscore[2]; ///< Propagating token of scores at word-end to detect corresponding end-of-words at word head short *wef; ///< Work area for word-end frame tokens for v2 LOGPROB *wes; ///< Work area for word-end score tokens for v2#endif} StackDecode;/** * User LM function entry point * */typedef struct { LOGPROB (*uniprob)(WORD_INFO *, WORD_ID, LOGPROB); ///< Pointer to function returning word occurence probability LOGPROB (*biprob)(WORD_INFO *, WORD_ID, WORD_ID, LOGPROB); ///< Pointer to function returning a word probability given a word context (corresponds to bi-gram) LOGPROB (*lmprob)(WORD_INFO *, WORD_ID *, int, WORD_ID, LOGPROB); ///< Pointer to function returning LM probability} LMFunc;/** * Work area for GMM calculation * */typedef struct __gmm_calc__{ LOGPROB *gmm_score; ///< Current accumurated scores for each GMM boolean *is_voice; ///< True if corresponding model designates speech, FALSE if noise int framecount; ///< Current frame count short OP_nstream; ///< Number of input stream for GMM VECT *OP_vec_stream[MAXSTREAMNUM]; ///< input vector for each stream at that frame short OP_veclen_stream[MAXSTREAMNUM]; ///< vector length for each stream LOGPROB *OP_calced_score; ///< Work area for Gaussian pruning on GMM: scores int *OP_calced_id; ///< Work area for Gaussian pruning on GMM: id int OP_calced_num; ///< Work area for Gaussian pruning on GMM: number of above int OP_calced_maxnum; ///< Work area for Gaussian pruning on GMM: size of allocated area int OP_gprune_num; ///< Number of Gaussians to be computed in Gaussian pruning VECT *OP_vec; ///< Local workarea to hold the input vector of current frame short OP_veclen; ///< Local workarea to hold the length of above HTK_HMM_Data *max_d; ///< Hold model of the maximum score int max_i; ///< Index of max_d#ifdef CONFIDENCE_MEASURE LOGPROB gmm_max_cm; ///< Hold maximum score#endif#ifdef GMM_VAD LOGPROB *rates; ///< voice rate of recent N frames (cycle buffer) int nframe; ///< Length of rates boolean filled; int framep; ///< Current frame pointer boolean in_voice; ///< TRUE if currently in voice area boolean up_trigger; ///< TRUE when detect up trigger boolean down_trigger; ///< TRUE when detect down trigger boolean after_trigger; ///< TRUE when currently we are processing speech segment boolean want_rewind; ///< TRUE if GMM wants rewinding its MFCC boolean want_rewind_reprocess; ///< TRUE if GMM wants re-processing after rewind int rewind_frame; ///< Frame to rewind int duration; ///< Current GMM duration work#endif} GMMCalc;/** * Alignment result, valid when forced alignment was done * */typedef struct __sentence_align__ { int num; ///< Number of units short unittype; ///< Unit type (one of PER_*) WORD_ID *w; ///< word sequence by id (PER_WORD) HMM_Logical **ph; ///< Phone sequence (PER_PHONEME, PER_STATE) short *loc; ///< sequence of state location in a phone (PER_STATE) boolean *is_iwsp; ///< TRUE if PER_STATE and this is the inter-word pause state at multipath mode int *begin_frame; ///< List of beginning frame int *end_frame; ///< List of ending frame LOGPROB *avgscore; ///< Score averaged by frames LOGPROB allscore; ///< Re-computed acoustic score struct __sentence_align__ *next; ///< data chain pointer} SentenceAlign;/** * Output result structure * */typedef struct __sentence__ { WORD_ID word[MAXSEQNUM]; ///< Sequence of word ID int word_num; ///< Number of words in the sentence LOGPROB score; ///< Likelihood (LM+AM) LOGPROB confidence[MAXSEQNUM]; ///< Word confidence scores LOGPROB score_lm; ///< Language model likelihood (scaled) for N-gram LOGPROB score_am; ///< Acoustic model likelihood for N-gram int gram_id; ///< The grammar ID this sentence belongs to for DFA SentenceAlign *align;} Sentence;/** * A/D-in work area * */typedef struct __adin__ { /* functions */ /// Pointer to function for device initialization (call once on startup) boolean (*ad_standby)(int, void *); /// Pointer to function to open audio stream for capturing boolean (*ad_begin)(); /// Pointer to function to close audio stream capturing boolean (*ad_end)(); /// Pointer to function to begin / restart recording boolean (*ad_resume)(); /// Pointer to function to pause recording boolean (*ad_pause)(); /// Pointer to function to terminate current recording immediately boolean (*ad_terminate)(); /// Pointer to function to read samples int (*ad_read)(SP16 *, int); /* configuration parameters */ int thres; ///< Input Level threshold (0-32767) int noise_zerocross; ///< Computed threshold of zerocross num in the cycle buffer int nc_max; ///< Computed number of fragments for tail margin boolean adin_cut_on; ///< TRUE if do input segmentation by silence boolean silence_cut_default; ///< Device-dependent default value of adin_cut_on() boolean strip_flag; ///< TRUE if skip invalid zero samples boolean enable_thread; ///< TRUE if input device needs threading boolean need_zmean; ///< TRUE if perform zmeansource /* work area */ int c_length; ///< Computed length of cycle buffer for zero-cross, actually equals to head margin length int c_offset; ///< Static data DC offset (obsolute, should be 0) SP16 *swapbuf; ///< Buffer for re-triggering in tail margin int sbsize; ///< Size of @a swapbuf int sblen; ///< Current length of @a swapbuf int rest_tail; ///< Samples not processed yet in swap buffer ZEROCROSS zc; ///< Work area for zero-cross computation#ifdef HAVE_PTHREAD /* Variables related to POSIX threading */ pthread_t adin_thread; ///< Thread information pthread_mutex_t mutex; ///< Lock primitive SP16 *speech; ///< Unprocessed samples recorded by A/D-in thread int speechlen; ///< Current length of @a speech/* * Semaphore to start/stop recognition. * * If TRUE, A/D-in thread will store incoming samples to @a speech and * main thread will detect and process them. * If FALSE, A/D-in thread will still get input and check trigger as the same * as TRUE case, but does not store them to @a speech. * */ boolean transfer_online; /** * TRUE if buffer overflow occured in adin thread. * */ boolean adinthread_buffer_overflowed; /** * TRUE if adin thread ended * */ boolean adinthread_ended; boolean ignore_speech_while_recog; ///< TRUE if ignore speech input between call, while waiting recognition process#endif /* Input data buffer */ SP16 *buffer; ///< Temporary buffer to hold input samples
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -