📄 recog.h

📁 julius version 4.12.about sound recognition.
💻 H
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/** * @file   recog.h *  * <JA> * @brief  エンジンインスタンスの年盗 * * 千急エンジンのインスタンス年盗を乖いますˉインスタンスは· * Recog をトップインスタンスとして·蝗脱する不读モデル·咐胳モデル· * それらを寥み圭わせた千急借妄インスタンスを剩眶积ちますˉ * * 称婶のインスタンスは·滦炳する jconf 柒の肋年菇陇挛·および * 蝗脱するサブインスタンスへのポインタを积ちますˉPROCESS_AM は不读モデル· * PROCESS_LM は咐胳モデルごとに年盗されますˉ * * MFCCCalc は· * 不读モデルおよび GMM で妥滇されるパラメ〖タタイプを拇べたのち· * それらを栏喇するのに涩妥なだけ栏喇されますˉ票办のMFCC房および * その戮のフロントエンド借妄掘凤を积つ不读モデルおよびGMMどうしでは * 票じ MFCCCalc が鼎铜されますˉ * * </JA> *  * <EN> * @brief  Enging instance definitions * * This file defines the engine instance and all its sub instances. * The top instance is Recog, and it consists of several * sub instances for LM, AM, and recognition process instances. * * Each sub-instance keeps pointer to corresponding jconf setting * part, and also has pointers to other instances to use. * PROCESS_AM will be generated for each acoustic model, and PROCESS_LM * will be for each language model. * * MFCCCalc will be generated for each required MFCC frontend types * by inspecting all AMs and GMM.  The AM's and GMMs that requires * exactly the same MFCC frontend will share the same MFCC frontend. * * </EN> * * <pre> * Recog *    +- *JCONF *    +- input related work area *    +- MFCCCalc[] (linked list) (generated from HMM + GMM) *    +- PROCESS_AM[] (linked list) *       +- *pointer to JCONF_AM *       +- *pointer to MFCCCalc *       +- hmminfo, hmm_gs *       +- hmmwrk *       +- multipath, ccd_flag, cmn_loaded *    +- PROCESS_LM[] (linked list) *       +- *pointer to JCONF_LM *       +- *pointer to PROCESS_AM *       +- lmtype, lmvar *       +- winfo *       +- ngram or grammars *       +- lmfunc *    +- RecogProcess process[] (linked list) *       +- *pointer to JCONF_SEARCH *       +- *pointer to PROCESS_AM *       +- *pointer to PROCESS_LM *       +- lmtype, lmvar *       +- misc. param *    +- GMMCalc *       +- *JCONF_AM for GMM *       +- *pointer to MFCCCalc * </pre> *  * @author Akinobu Lee * @date   Fri Feb 16 13:42:28 2007 * * $Revision: 1.7 $ *  *//* * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology * All rights reserved *//**/#ifndef __J_RECOG_H__#define __J_RECOG_H__#include <sent/stddefs.h>#include <sent/hmm.h>#include <sent/vocabulary.h>#include <sent/ngram2.h>#include <sent/dfa.h>#include <julius/wchmm.h>#include <julius/search.h>#include <julius/callback.h>#include <julius/jconf.h>/*  How tokens are managed:   o  tlist[][] is a token stocker.  It holds all tokens in sequencial      buffer.  They are malloced first on startup, and refered by ID while      Viterbi procedure.  In word-pair mode, each token also has a link to      another token to allow a node to have more than 1 token.         o  token[n] holds the current ID number of a token associated to a      lexicon tree node 'n'.  *//** * Work area for the first pass *  */typedef struct __FSBeam__ {  /* token stocker */  TOKEN2 *tlist[2];     ///< Token space to hold all token entities.  TOKENID *tindex[2];   ///< Token index corresponding to @a tlist for sort  int maxtnum;          ///< Allocated number of tokens (will grow)  int expand_step;      ///< Number of tokens to be increased per expansion  boolean expanded;     ///< TRUE if the tlist[] and tindex[] has been expanded at last create_token();  int tnum[2];          ///< Current number of tokens used in @a tlist  int n_start;          ///< Start index of in-beam nodes on @a tindex  int n_end;            ///< end index of in-beam nodes on @a tindex  int tl;               ///< Current work area id (0 or 1, swapped for each frame)  int tn;               ///< Next work area id (0 or 1, swapped for each frame)      /* Active token list */  TOKENID *token;       ///< Active token list that holds currently assigned tokens for each tree node#ifdef UNIGRAM_FACTORING  /* for wordend processing with 1-gram factoring */  LOGPROB wordend_best_score; ///< Best score of word-end nodes  int wordend_best_node;        ///< Node id of the best wordend nodes  TRELLIS_ATOM *wordend_best_tre; ///< Trellis word corresponds to above  WORD_ID wordend_best_last_cword;      ///< Last context-aware word of above#endif  int totalnodenum;     ///< Allocated number of nodes in @a token  TRELLIS_ATOM bos;     ///< Special token for beginning-of-sentence  boolean nodes_malloced; ///< Flag to check if tokens already allocated  LOGPROB lm_weight;           ///< Language score weight (local copy)  LOGPROB lm_penalty;          ///< Word insertion penalty (local copy)  LOGPROB lm_penalty_trans; ///< Additional insertion penalty for transparent words (local copy)  LOGPROB penalty1; ///< Word insertion penalty for DFA (local copy)#if defined(WPAIR) && defined(WPAIR_KEEP_NLIMIT)  boolean wpair_keep_nlimit; ///< Keeps only N token on word-pair approx. (local copy from jconf)#endif  /* for short-pause segmentation */  boolean in_sparea;         ///< TRUE when we are in a pause area now  int tmp_sparea_start;         ///< Memorize where the current pause area begins#ifdef SP_BREAK_RESUME_WORD_BEGIN  WORD_ID tmp_sp_break_last_word; ///< Keep the max word hypothesis at beginning of this segment as the starting word of next segment#else  WORD_ID last_tre_word;        ///< Keep ths max word hypothesis at the end of this segment for as the starting word of the next segment#endif  boolean first_sparea;  ///< TRUE when we are in the first pause area  int sp_duration;   ///< Number of current successive sp frame#ifdef SPSEGMENT_NAIST  boolean after_trigger;        ///< TRUE if speech already triggered   int trigger_duration;         ///< Current speech duration at uptrigger detection  boolean want_rewind;          ///< TRUE if process wants mfcc rewinding  int rewind_frame;             ///< Place to rewind to  boolean want_rewind_reprocess; ///< TRUE if requires re-processing after rewind#endif  char *pausemodelnames;        ///< pause model name string to detect segment  char **pausemodel;            ///< each pause model name to detect segment  int pausemodelnum;            ///< num of pausemodel} FSBeam;/** * Work area for realtime processing of 1st pass *  */typedef struct __RealBeam__ {  /* input parameter */  int maxframelen;              ///< Maximum allowed input frame length  SP16 *window;         ///< Window buffer for MFCC calculation  int windowlen;                ///< Buffer length of @a window  int windownum;                ///< Currently left samples in @a window  /* for short-pause segmentation */  boolean last_is_segmented; ///<  TRUE if last pass was a segmented input  SP16 *rest_Speech; ///< Speech samples left unprocessed by segmentation at previous segment  int rest_alloc_len;   ///< Allocated length of rest_Speech  int rest_len;         ///< Current stored length of rest_Speech} RealBeam;/** * Work area for the 2nd pass *  */typedef struct __StackDecode__ {  int hypo_len_count[MAXSEQNUM+1];      ///< Count of popped hypothesis per each length  int maximum_filled_length; ///< Current least beam-filled depth#ifdef SCAN_BEAM  LOGPROB *framemaxscore; ///< Maximum score of each frame on 2nd pass for score enveloping#endif  NODE *stocker_root; ///< Node stocker for recycle  int popctr;           ///< Num of popped hypotheses from stack  int genectr;          ///< Num of generated hypotheses  int pushctr;          ///< Num of hypotheses actually pushed to stack  int finishnum;        ///< Num of found sentence hypothesis  NODE *current;                ///< Current node for debug#ifdef CONFIDENCE_MEASURE  LOGPROB cm_alpha;             ///< alpha scaling value from jconf# ifdef CM_MULTIPLE_ALPHA  LOGPROB *cmsumlist;        ///< Sum of cm score for each alpha coef.  int cmsumlistlen;             ///< Allocated length of cmsumlist.# endif# ifdef CM_SEARCH  LOGPROB cm_tmpbestscore; ///< Temporal best score for summing up scores#  ifndef CM_MULTIPLE_ALPHA  LOGPROB cm_tmpsum;            ///< Sum of CM score#  endif  int l_stacksize;              ///< Local stack size for CM  int l_stacknum;               ///< Num of hypo. in local stack for CM  NODE *l_start;        ///< Top node of local stack for CM  NODE *l_bottom;       ///< bottom node of local stack for CM# endif# ifdef CM_NBEST  LOGPROB *sentcm = NULL;       ///< Confidence score of each sentence  LOGPROB *wordcm = NULL;       ///< Confidence score of each word voted from @a sentcm  int sentnum;          ///< Allocated length of @a sentcm# endif#endif /* CONFIDENCE_MEASURE */  LOGPROB *wordtrellis[2]; ///< Buffer to compute viterbi path of a word  LOGPROB *g;           ///< Buffer to hold source viterbi scores  HMM_Logical **phmmseq;        ///< Phoneme sequence to be computed  int phmmlen_max;              ///< Maximum length of @a phmmseq.  boolean *has_sp;              ///< Mark which phoneme allow short pause for multi-path mode#ifdef GRAPHOUT_PRECISE_BOUNDARY  short *wend_token_frame[2]; ///< Propagating token of word-end frame to detect corresponding end-of-words at word head  LOGPROB *wend_token_gscore[2]; ///< Propagating token of scores at word-end to detect corresponding end-of-words at word head  short *wef;           ///< Work area for word-end frame tokens for v2  LOGPROB *wes;         ///< Work area for word-end score tokens for v2#endif} StackDecode;/** * User LM function entry point *  */typedef struct {  LOGPROB (*uniprob)(WORD_INFO *, WORD_ID, LOGPROB); ///< Pointer to function returning word occurence probability  LOGPROB (*biprob)(WORD_INFO *, WORD_ID, WORD_ID, LOGPROB); ///< Pointer to function returning a word probability given a word context (corresponds to bi-gram)  LOGPROB (*lmprob)(WORD_INFO *, WORD_ID *, int, WORD_ID, LOGPROB); ///< Pointer to function returning LM probability} LMFunc;/** * Work area for GMM calculation *  */typedef struct __gmm_calc__{  LOGPROB *gmm_score;   ///< Current accumurated scores for each GMM  boolean *is_voice;            ///< True if corresponding model designates speech, FALSE if noise  int framecount;               ///< Current frame count  short OP_nstream;             ///< Number of input stream for GMM  VECT *OP_vec_stream[MAXSTREAMNUM]; ///< input vector for each stream at that frame  short OP_veclen_stream[MAXSTREAMNUM]; ///< vector length for each stream  LOGPROB *OP_calced_score; ///< Work area for Gaussian pruning on GMM: scores  int *OP_calced_id; ///< Work area for Gaussian pruning on GMM: id  int OP_calced_num; ///< Work area for Gaussian pruning on GMM: number of above  int OP_calced_maxnum; ///< Work area for Gaussian pruning on GMM: size of allocated area  int OP_gprune_num; ///< Number of Gaussians to be computed in Gaussian pruning  VECT *OP_vec;         ///< Local workarea to hold the input vector of current frame  short OP_veclen;              ///< Local workarea to hold the length of above  HTK_HMM_Data *max_d;  ///< Hold model of the maximum score  int max_i;                    ///< Index of max_d#ifdef CONFIDENCE_MEASURE  LOGPROB gmm_max_cm;   ///< Hold maximum score#endif#ifdef GMM_VAD  LOGPROB *rates;   ///< voice rate of recent N frames (cycle buffer)  int nframe;                   ///< Length of rates  boolean filled;  int framep;                   ///< Current frame pointer  boolean in_voice;             ///< TRUE if currently in voice area  boolean up_trigger;           ///< TRUE when detect up trigger  boolean down_trigger;         ///< TRUE when detect down trigger  boolean after_trigger;        ///< TRUE when currently we are processing speech segment  boolean want_rewind;          ///< TRUE if GMM wants rewinding its MFCC  boolean want_rewind_reprocess; ///< TRUE if GMM wants re-processing after rewind  int rewind_frame;             ///< Frame to rewind  int duration;                 ///< Current GMM duration work#endif} GMMCalc;/** * Alignment result, valid when forced alignment was done *  */typedef struct __sentence_align__ {  int num;                    ///< Number of units  short unittype;             ///< Unit type (one of PER_*)  WORD_ID *w;                 ///< word sequence by id (PER_WORD)  HMM_Logical **ph;     ///< Phone sequence (PER_PHONEME, PER_STATE)  short *loc; ///< sequence of state location in a phone (PER_STATE)  boolean *is_iwsp;           ///< TRUE if PER_STATE and this is the inter-word pause state at multipath mode  int *begin_frame;           ///< List of beginning frame  int *end_frame;             ///< List of ending frame  LOGPROB *avgscore;          ///< Score averaged by frames  LOGPROB allscore;           ///< Re-computed acoustic score  struct __sentence_align__ *next; ///< data chain pointer} SentenceAlign;/** * Output result structure *  */typedef struct __sentence__ {  WORD_ID word[MAXSEQNUM];      ///< Sequence of word ID   int word_num;                 ///< Number of words in the sentence  LOGPROB score;                ///< Likelihood (LM+AM)  LOGPROB confidence[MAXSEQNUM]; ///< Word confidence scores  LOGPROB score_lm;             ///< Language model likelihood (scaled) for N-gram  LOGPROB score_am;             ///< Acoustic model likelihood for N-gram  int gram_id;                  ///< The grammar ID this sentence belongs to for DFA  SentenceAlign *align;} Sentence;/**  * A/D-in work area *  */typedef struct __adin__ {  /* functions */  /// Pointer to function for device initialization (call once on startup)  boolean (*ad_standby)(int, void *);  /// Pointer to function to open audio stream for capturing  boolean (*ad_begin)();  /// Pointer to function to close audio stream capturing  boolean (*ad_end)();  /// Pointer to function to begin / restart recording  boolean (*ad_resume)();  /// Pointer to function to pause recording  boolean (*ad_pause)();  /// Pointer to function to terminate current recording immediately  boolean (*ad_terminate)();  /// Pointer to function to read samples  int (*ad_read)(SP16 *, int);  /* configuration parameters */  int thres;            ///< Input Level threshold (0-32767)  int noise_zerocross;  ///< Computed threshold of zerocross num in the cycle buffer  int nc_max;           ///< Computed number of fragments for tail margin  boolean adin_cut_on;  ///< TRUE if do input segmentation by silence  boolean silence_cut_default; ///< Device-dependent default value of adin_cut_on()  boolean strip_flag;   ///< TRUE if skip invalid zero samples  boolean enable_thread;        ///< TRUE if input device needs threading  boolean need_zmean;   ///< TRUE if perform zmeansource  /* work area */  int c_length; ///< Computed length of cycle buffer for zero-cross, actually equals to head margin length  int c_offset; ///< Static data DC offset (obsolute, should be 0)  SP16 *swapbuf;                ///< Buffer for re-triggering in tail margin  int sbsize;    ///< Size of @a swapbuf  int sblen;    ///< Current length of @a swapbuf  int rest_tail;                ///< Samples not processed yet in swap buffer  ZEROCROSS zc;                 ///< Work area for zero-cross computation#ifdef HAVE_PTHREAD  /* Variables related to POSIX threading */  pthread_t adin_thread;	///< Thread information  pthread_mutex_t mutex;        ///< Lock primitive  SP16 *speech;         ///< Unprocessed samples recorded by A/D-in thread  int speechlen;                ///< Current length of @a speech/* * Semaphore to start/stop recognition. *  * If TRUE, A/D-in thread will store incoming samples to @a speech and * main thread will detect and process them. * If FALSE, A/D-in thread will still get input and check trigger as the same * as TRUE case, but does not store them to @a speech. *  */  boolean transfer_online;  /**   * TRUE if buffer overflow occured in adin thread.   *    */  boolean adinthread_buffer_overflowed;  /**   * TRUE if adin thread ended   *    */  boolean adinthread_ended;  boolean ignore_speech_while_recog; ///< TRUE if ignore speech input between call, while waiting recognition process#endif  /* Input data buffer */  SP16 *buffer; ///< Temporary buffer to hold input samples
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -