time_align.c

来自「WinCE平台上的语音识别程序」· C语言代码 · 共 2,010 行 · 第 1/5 页
2,010 行
/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- *//* ==================================================================== * Copyright (c) 1999-2004 Carnegie Mellon University.  All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright *    notice, this list of conditions and the following disclaimer.  * * 2. Redistributions in binary form must reproduce the above copyright *    notice, this list of conditions and the following disclaimer in *    the documentation and/or other materials provided with the *    distribution. * * This work was supported in part by funding from the Defense Advanced  * Research Projects Agency and the National Science Foundation of the  * United States of America, and the CMU Sphinx Speech Consortium. * * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * ==================================================================== * *//* * time_align.c * * Description: *   These routines will time align a given word string to a given acoustic input stream. *   The steps involved in this process are: *   	- The given word string is converted into a triphone model DAG. *	- The viterbi algorithm is applied to find the best state sequence through the DAG *	- State, phone and word level backpointers are maintained in order to retrieve *	  the best state, phone and word level segmentations of the acoustic input stream. * *   Optional silence is allowed between words. * * To Do: *   - allow for arbitrary left and right context begin and end phones.  This would allow *     these routines to do a detailed phonetic analysis in an error region for instance. *   - allow optional filler word sequence between words (e.g. ++COUGH++ ++SNIFF++ SIL ++SNIFF++). * * Revision History *  * 22-Nov-2004	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * 		Modified to use senscr module for senone score evaluation. *  * Revision 1.15  2004/11/13 00:38:43  egouvea * Replaced most printf with E_INFO (or E_WARN or...). Changed the output * of the time_align code so it's consistent with the other decoder modes * (allphone, normal decoding etc). Added the file utt id to the * time_align output. * * Revision 1.14  2004/07/16 00:57:11  egouvea * Added Ravi's implementation of FSG support. * * Revision 1.2  2004/05/27 14:22:57  rkm * FSG cross-word triphones completed (but for single-phone words) * * Revision 1.13  2001/12/11 00:24:48  lenzo * Acknowledgement in License. * * Revision 1.12  2001/12/07 17:30:02  lenzo * Clean up and remove extra lines. * * Revision 1.11  2001/12/07 05:09:30  lenzo * License.xsxc * * Revision 1.10  2001/12/07 04:27:35  lenzo * License cleanup.  Remove conditions on the names.  Rationale: These * conditions don't belong in the license itself, but in other fora that * offer protection for recognizeable names such as "Carnegie Mellon * University" and "Sphinx."  These changes also reduce interoperability * issues with other licenses such as the Mozilla Public License and the * GPL.  This update changes the top-level license files and removes the * old license conditions from each of the files that contained it. * All files in this collection fall under the copyright of the top-level * LICENSE file. * * Revision 1.9  2001/12/07 00:51:49  lenzo * Quiet warnings. * * Revision 1.8  2001/10/23 22:20:30  lenzo * Change error logging and reporting to the E_* macros that call common * functions.  This will obsolete logmsg.[ch] and they will be removed * or changed in future versions. * * Revision 1.7  2001/07/02 16:47:12  lenzo * Fixed triphone lookup fallback case. * * Revision 1.6  2001/02/13 19:51:38  lenzo * *** empty log message *** * * Revision 1.5  2001/01/25 19:36:29  lenzo * Fixing some memory leaks * * Revision 1.4  2000/12/12 23:01:42  lenzo * Rationalizing libs and names some more.  Split a/d and fe libs out. * * Revision 1.3  2000/12/05 01:45:12  lenzo * Restructuring, hear rationalization, warning removal, ANSIfy * * Revision 1.2  2000/03/29 14:30:28  awb * *** empty log message *** * * Revision 1.1.1.1  2000/01/28 22:08:57  lenzo * Initial import of sphinx2 * * *  * 02-Jan-96	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * 		Added time_align_word flag to determine whether word segmentation is * 		output.  Implemented printing of word and phone segmentations. *  * 02-Jan-96	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * 		Added time_align_phone and time_align_state flags for determining at run * 		time whether phone-level and state-level backtrace are printed. *  * Revision 1.4  1995/01/27  18:04:18  eht * Fixed a print statement bug * * Revision 1.3  1994/10/13  11:33:12  eht * Fixed handling of single phone words. * * Revision 1.2  1994/09/26  15:58:12  eht * Have time_align control allocation/deallocation of buffers. * Output revision control ID at initialization time * * Revision 1.1  1994/09/26  13:24:45  eht * Initial revision * * Public Interface: * *   time_align_init() *     Initializes the system.  Should be called once before any of the following *     calls are made. *      *   time_align_set_input() *	Provide the speech input for time alignment.  A call to this must precede *	any of the following calls. * *   time_align_word_sequence(char *left_word, char *word_seq, char *right_word) *	Performs the forced recognition against the input.  A call to this routine *	must precede the following calls. * *   char *time_align_best_word_string() *	Returns the best word string associated with the best state sequence found *	by time_align_word_sequence().  Included in this string are filler *	words (sil, silb, sile, noise words). * *	BEWARE: *		The caller must NOT free the returned string. * *   time_align_seg_output(unsigned short **seg, int *seg_cnt) *     Returns the state time alignments in the form of a sequence of shorts *     where: *	        high bit set indicates the first frame of a phone * *		   x = ci_phone_id * 5 + state_id,  where state_id is either *			0, 1, 2, 3, 4 and ci_phone_id is the context independent *			from the phone file given to the system. * *	These data are used by the senone decision tree builder. * *	BEWARE: *		The caller must NOT free the returned seg array. * *   SEGMENT_T *time_align_get_segmentation(int kind, int *seg_cnt) */#define SHOW_NOTHING		(0x0000)#define SHOW_EVERYTHING		(0xffff)#define SHOW_INVOKATION		(0x0001)        /* print function call invokation info */#define SHOW_MODEL_EVAL		(0x0002)        /* print model evaluation trace */#define SHOW_SUMMARY_INFO	(0x0004)        /* print search summary info */#define SHOW_ACTIVE		(0x0008)        /* print the active model state before/after evaluation                                                   scores */#define SHOW_BP			(0x0010)        /* show any new backpointer info for each frame */#define SHOW_FORCED_MODEL	(0x0020)        /* show the topology of the model to be used for                                                   forced recognition */#define SHOW_BEST_WORD_PATH	(0x0040)        /* print the best scoring word alignment */#define SHOW_PHONE_GRAPH	(0x0080)        /* print the phone graph to be searched */#define SHOW_PRUNING		(0x0100)        /* print the models pruned per frame */#define SHOW_MODEL_DAG		(0x0200)        /* print the model dag used */#define SHOW_NODE_EXPANSION	(0x0400)        /* print ci -> triphone expansion information */#define SHOW_SYS_INFO		(0x0800)        /* print memory usage/ system performance figures */#define SHOW_BEST_PHONE_PATH	(0x1000)        /* print the best scoring phone alignment */#define SHOW_BEST_STATE_PATH	(0x2000)        /* print the best scoring state alignment */#define SHOW_BEST_PATHS		(SHOW_BEST_STATE_PATH|SHOW_BEST_WORD_PATH|SHOW_BEST_PHONE_PATH)#if !defined(SHOW)#define SHOW SHOW_BEST_PATHS#endif/* UNIX/C stuff */#include <assert.h>#include <stdarg.h>#include <stdio.h>#include <stdlib.h>#include <assert.h>#include <string.h>#include <ctype.h>#include <stdarg.h>/* CMU Speech stuff */#include "s2types.h"#include "ckd_alloc.h"#include "basic_types.h"#include "strfuncs.h"#include "list.h"#include "search_const.h"#include "msd.h"#include "dict.h"#include "lmclass.h"#include "lm_3g.h"#include "lm.h"#include "kb.h"#include "phone.h"#include "log.h"#include "s2_semi_mgau.h"#include "senscr.h"#include "s2params.h"#include "fbs.h"#include "senscr.h"#include "search.h"#include "err.h"#include "uttproc.h"#include "cmd_ln.h"/* This module stuff */#include "time_align.h"#ifndef TRUE#define TRUE 1#define FALSE 0#endifint save_labs(SEGMENT_T * segs,              int num_entries,              const char *dirname,              const char *filename,              const char *extname, const char *labtype);static mfcc_t ***feat_f;static int frame_cnt = 0;static int *active_models[2];static int *cur_active_models;static int *boundary_active_models;static int *pruned_active_models;static int cur_active_cnt;static int next_active_cnt;static int cur_frame = 0;#define WORD_BP_TABLE_SIZE_INCREMENT	(1 * 1000)BACK_POINTER_T *word_bp_table = NULL;static int word_bp_table_next_free;static int word_bp_table_frame_start;static int max_word_bp_table_size = WORD_BP_TABLE_SIZE_INCREMENT;#define PHONE_BP_TABLE_SIZE_INCREMENT	(10 * 1000)BACK_POINTER_T *phone_bp_table = NULL;static int phone_bp_table_next_free;static int phone_bp_table_frame_start;static int max_phone_bp_table_size = PHONE_BP_TABLE_SIZE_INCREMENT;#define STATE_BP_TABLE_SIZE_INCREMENT	(NODE_CNT * 10 * 1000)BACK_POINTER_T *state_bp_table = NULL;static int state_bp_table_next_free;static int state_bp_table_frame_start;static int max_state_bp_table_size = STATE_BP_TABLE_SIZE_INCREMENT;static DYNMODEL_T *all_models = NULL;static int all_model_cnt;static int32 beam_width;static int32 n_word_segments = 0;static int32 n_phone_segments = 0;static int32 n_state_segments = 0;static char *best_word_string = NULL;static int best_word_string_len = 0;static int saved_final_model;static int32 sil_word_id;static int32 sil_phone_id;static int32 silb_phone_id;static int32 sile_phone_id;static int32 start_word_id;static int32 end_word_id;static const char *lcl_utt_id = NULL;static SEGMENT_T *wdseg = NULL;static SEGMENT_T *phseg = NULL;voidtime_align_set_utt(const char *id){    lcl_utt_id = id;}voidtime_align_set_beam_width(double bw){#if SHOW&SHOW_INVOKATION    E_INFO("time_align_set_beam_width(%e) called\n", bw);#endif    beam_width = 8 * LOG(bw);}intconstituent_cnt(char const *compound_word){    char *uscore;    int cnt;    char const *rem_word;    rem_word = compound_word;    cnt = 1;    uscore = strchr(rem_word, '_');    while ((uscore = strchr(uscore + 1, '_')))        cnt++;    ++cnt;    return cnt;}char *head_word_of(int k){    dict_entry_t *cmpnd_wrd = word_dict->dict_list[k];    static char head_word[1024];    char *uscore;    strcpy(head_word, cmpnd_wrd->word);    uscore = strchr(head_word, '_');    *uscore = '\0';    return head_word;}char *cvt_uscores_to_sp(char const *word){    char *wrk = ckd_salloc(word);    char *uscore;    uscore = wrk;    while ((uscore = strchr(uscore + 1, '_'))) {        *uscore = ' ';    }    return wrk;}intdescending_order_by_len(const void *a, const void *b){    const COMPOUND_WORD_T *a_wrd = a;    const COMPOUND_WORD_T *b_wrd = b;    /* sort into descending order */    if (a_wrd->word_cnt < b_wrd->word_cnt) {        return 1;               /* implies swap */    }    else if (a_wrd->word_cnt > b_wrd->word_cnt) {        return -1;    }    else        return 0;}COMPOUND_WORD_T *mk_compound_word_list(int *out_cnt){    int i, j;    dict_entry_t **dict_list = word_dict->dict_list;
time_align.c - 源码说明

本页面展示了「WinCE平台上的语音识别程序」中的 time_align.c 源码文件，采用 C语言编程语言编写，共 2,010 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与WinCE相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?