📄 live_decode_api.h

📁 CMU大名鼎鼎的SPHINX－3大词汇量连续语音识别系统
💻 H
字号:
/* ==================================================================== * Copyright (c) 1999-2004 Carnegie Mellon University.	All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright *    notice, this list of conditions and the following disclaimer.  * * 2. Redistributions in binary form must reproduce the above copyright *    notice, this list of conditions and the following disclaimer in *    the documentation and/or other materials provided with the *    distribution. * * This work was supported in part by funding from the Defense Advanced  * Research Projects Agency and the National Science Foundation of the  * United States of America, and the CMU Sphinx Speech Consortium. * * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * ==================================================================== * */ /************************************************* * CMU ARPA Speech Project * * Copyright (c) 2000 Carnegie Mellon University. * ALL RIGHTS RESERVED. ************************************************* * *  May 14, 2004 *    Created by Yitao Sun (yitao@cs.cmu.edu) based on the live.h created by *    Rita Singh.  The Live Decode API is the new top level API for Sphinx3. *    The goal of the Live Decode API is to provide a well documented and *    comprehensive API to control all aspects of the Sphinx3 speech decoder *    engine. * *    The return values, for example, hypothesis segments and string, unlike *    the rest of Sphinx3, are read-only, maintained internally, and clobbered *    by subsequent calls. *//*revision 1.9date: 2004/09/03 21:45:26;  author: yitao;  state: Exp;  lines: +2 -2cleaning up remote_decode API by moving list operations into a list API----------------------------revision 1.8date: 2004/09/03 16:50:56;  author: yitao;  state: Exp;  lines: +108 -37modified comments to suit the use of doc++-----------------------------revision 1.7date: 2004/08/27 05:22:43;  author: yitao;  state: Exp;  lines: +75 -105removed remote-decode API from the linux compile.  added doc++ comments for live_decode.h-----------------------------revision 1.6date: 2004/08/25 20:44:31;  author: yitao;  state: Exp;  lines: +13 -151.  added code to record uttid in live-decode2.  added more code to flesh out remote-decode.  not compiling yet.----------------------------revision 1.5date: 2004/08/23 20:41:38;  author: yitao;  state: Exp;  lines: +1 -11basic implementation for remote-decode API.  not compiling yet.----------------------------revision 1.4date: 2004/08/19 19:12:50;  author: yitao;  state: Exp;  lines: +1 -1incompleted files remote-decode API.----------------------------revision 1.3date: 2004/08/09 21:40:36;  author: yitao;  state: Exp;  lines: +11 -201.  fixed some bugs in Live-Decode API.  changed kb.c, kb.h, utt.c, live_decode.c, live_decode.h.2.  changed some filenames in src/programs/.  now there are 2 sets of livedecode and livepretend: one that uses the old API (livedecode and livepretend), and one that uses the new API (livedecode2 and livepretend2).3.  modified Makefile.am to reflect the filename changes above.----------------------------revision 1.2date: 2004/08/08 23:34:50;  author: arthchan2003;  state: Exp;  lines: +1 -1temporary fixes of live_decode.c and live_decode.h----------------------------revision 1.1date: 2004/08/06 15:07:38;  author: yitao;  state: Exp;*** empty log message ***=============================================================================*/#ifndef __LIVE_DECODE_H#define __LIVE_DECODE_H#include "kb.h"#include "fe.h"/** \file live_decode_API.h * \brief header for live mode decoding API  */#ifdef __cplusplusextern "C" {#endif#define MAX_CEP_LEN				64#define LD_SUCCESS				0#define LD_ERROR_OUT_OF_MEMORY			-0x01#define LD_ERROR_NULL_POINTER			-0x02#define LD_ERROR_INVALID_STATE			-0x04#define LD_ERROR_INTERNAL			-0x08#define LD_STATE_IDLE				0#define LD_STATE_DECODING			1#define LD_STATE_FINISHED			2  /** Wrapper structure for live-mode recognition   */typedef struct{  /**   * Knowledge base.   */  kb_t kb;  /**   * Pointer to the knowledge base core.   */  kbcore_t *kbcore;  /**   * Pointer to the front-end.   */  fe_t *fe;  /**   * File pointer to the HMM logfile.   */  FILE *hmm_log;  /**   * Parameter: intervals at which wbeam is used for phone transitions.   */  int32 phones_skip;  /**   * Number of frames decoded.   */  int32 num_frames_decoded;  /**   * Number of frames entered.   */  int32 num_frames_entered;  /**   * Current state of the live decoder.   */  int32 ld_state;  /**   * UTTID (obviously NOT) filled in by knowledge-base.   */  char *uttid;  /**   * The frame number at which the hypothesis is recorded.   */  int32 hyp_frame_num;  /**   * Hypothesis string.  Result (or partial result) of the recognition is   * stored as a complete string.   */  char *hyp_str;  /**   * Hypothesis word segments.  Result (or partial result) of the recognition   * is stored as word segments.  Null-terminated array.   */  hyp_t **hyp_segs;  /**   * Boolean indicator whether we've internally allocated space for the   * command line arguments.   */  int32 internal_cmdln;  /**   * Boolean indicates whether we will internally swap the samples.    */  int32 swap;  /**   * Boolean indicates whether a partial hypothesis will be dumped.    */  int32 phypdump;  /**   * Feature buffer.  Re-allocation of feature buffer is quite expensive.  So   * we allocate once per live decoder.   */   float32 ***features;  /**   * Extenstion for the raw director    */  char* rawext;} live_decoder_t;/** Initializes the live-decoder.  Internal modules, including the cepstra-    generating front-end, the language model, and the accoustic models are    initialized, and live-decoder internal variables are set to a starting    state.    This version of the live-decoder assumes the user has externally parsed    arguments using <I>cmd_ln_parse()</I> or <I>cmd_ln_parse_file()</I>.  The    user is responsible for calling <I>cmd_ln_free()</I> when he/she is done    with the live-decoder.    @param decoder Pointer to the decoder.    @return 0 for success.  -1 for failure.    @see ld_finish*/int ld_init(live_decoder_t *_decoder);/** Initializes the live-decoder.  Internal modules, including the cepstrum-    generating front-end, the language model, and the accoustic models are    initialized, and live-decoder internal variables are set to a starting    state.        This version uses the <I>cmd_ln.h</I> interface internally.  Arguments are    parsed and stored internally, and freed later when    <I>{@link ld_finish ld_finish()}</I> is called.    @param decoder Pointer to the decoder.    @param argc Argument count.    @param argv Argument string array.    @return 0 for success.  -1 for failure.    @see ld_finish*/int ld_init_with_args(live_decoder_t *_decoder, int _argc, char **_argv);/** Wraps up the live-decoder.  All internal modules are closed or unloaded.    Internal variables are either freed or set to a finishing state.  This    function should be called once the user is finished with the live-decoder.    @param decoder Pointer to the decoder.    @see ld_init    @see ld_init_with_args*/void ld_finish(live_decoder_t *_decoder);/** Marks the start of the current utterance.  An utterance is a session of    speech decoding that starts with a call to <I>ld_begin_utt()</I> and ends     with a call to <I>{@link ld_end_utt ld_end_utt()}</I>.  In the duration of     an utterance, speech data is processed with either    <I>{@link ld_process_raw ld_process_raw()}</I> or    <I>{@link ld_process_ceps ld_process_ceps()}</I>.  Decoding results    (hypothesis) can be retrieved any time after the start of an utterance    using <I>{@link ld_retrieve_hyps ld_retrieve_hyps()}</I>.  However, all     previous results will be clobbered at the start of a new utterance.    At the moment, there is an undocumented time limit to the length of an    utterance.    @param decoder Pointer to the decoder.    @param uttid Utterance ID string.  If <I>null</I>, the utterance ID is    ignored.    @return 0 for success.  -1 for failure.    @see ld_end_utt    @see ld_process_raw    @see ld_process_ceps    @see ld_retrieve_hyps*/int ld_begin_utt(live_decoder_t *_decoder, char *_uttid);/** Marks the end of the current utterance.  The Live-Decode API can no longer    process speech data until the start of the next utterance.  Any hypothesis    retrieved prior to the end of the utterance is called a partial hypothesis.    Any hypothesis retrieved after the end of the utterance is called the final    hypothesis.  See <I>{@link ld_retrieve_hyps ld_retrieve_hyps()}</I> on how    to retrieve hypothesis.    @param decoder Pointer to the decoder    @see ld_begin_utt    @see ld_process_raw    @see ld_process_ceps    @see ld_retrieve_hyps*/void ld_end_utt(live_decoder_t *_decoder);/** Process raw 16-bit samples for the current utterance decoding.  This    function has to be called in the duration of an utterance.  That is,    in between calls to <I>{@link ld_begin_utt ld_begin_utt()}</I> and     <I>{@link ld_end_utt ld_end_utt()}</I>.    @param decoder Pointer to the decoder.    @param samples Buffer of int16 audio samples.    @param num_samples Number of samples in the buffer.    @see ld_begin_utt    @see ld_end_utt    @see ld_process_ceps*/void ld_process_raw(live_decoder_t *_decoder, 		    int16 *_samples,		    int32 _num_samples);  /** Process a buffer of cepstrum frames for the current utterance.  To use    this function, make sure that the parameters to the cepstra-generating    front-end that matches the parameters to the decoder's accoustic     model.  This  function has to be called in the duration of an utterance.    That is, in between calls to <I>{@link ld_begin_utt ld_begin_utt()}</I> and    <I>{@link ld_end_utt ld_end_utt()}</I>.    @param decoder Pointer to the decoder.    @param frames Buffer of audio feature frames.    @param num_frames Number of frames in the buffer.    @return 0 for success.  -1 for failure.    @see ld_begin_utt    @see ld_end_utt    @see ld_process_ceps*/void ld_process_ceps(live_decoder_t *_decoder, 		     float32 **_frames,		     int32 _num_frames);/** Retrieve partial or final decoding results (hypothesis).  Any    hypothesis retrieved prior to the end of the utterance is called a     partial hypothesis.  Any hypothesis retrieved after the end of the     utterance is called the final hypothesis.  The hypothesis can be    returned in a plain READ-ONLY string and/or an array of READ-ONLY word    segments.  In the plain string result, all filler and end words are    filtered out as well as the pronouciation information.  What is left is a    very readable string representation of the decoding result.  There is no    filtering in the word segment result.    Here is an example on how to use the result returned by    <I>ld_retrieve_hyps</I>:    <PRE>    live_decoder_t d;    char *str;    hyp_t **segs;    ...    ld_retrieve_hyps(&d, &str, &segs);    printf("Decoded string: %s\n", str);    for (; *segs; segs++) {      printf("Word-segment id: %i\n", (*segs)->id);    }    </PRE>        @param decoder Pointer to the decoder.    @param hyp_str Returned pointer to a READ-ONLY string.  If <I>null</I>,    the string is not returned.    @param hyp_segs Returned pointer to a null-terminated array of word    segments.  If <I>null</I>, the array is not returned.    @return 0 for success.  -1 for failure.*/int ld_retrieve_hyps(live_decoder_t *_decoder, char **_uttid, char **_hyp_str,		     hyp_t ***_hyp_segs);/** Abort the current decoding process immediately.  As opposed to    <I>{@link ld_end_utt ld_end_utt()}</I>.  Retrieving the hypothesis after an    abort is not guaranteed.    <EM>!!! NOT IMPLEMENTED YET !!!</EM>    @param decoder Pointer to the decoder.    @see ld_end_utt*/void ld_abort_utt(live_decoder_t *_decoder);#ifdef __cplusplus}#endif#endif
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -