📄 time_align.c
字号:
/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- *//* ==================================================================== * Copyright (c) 1999-2004 Carnegie Mellon University. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * This work was supported in part by funding from the Defense Advanced * Research Projects Agency and the National Science Foundation of the * United States of America, and the CMU Sphinx Speech Consortium. * * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * ==================================================================== * *//* * time_align.c * * Description: * These routines will time align a given word string to a given acoustic input stream. * The steps involved in this process are: * - The given word string is converted into a triphone model DAG. * - The viterbi algorithm is applied to find the best state sequence through the DAG * - State, phone and word level backpointers are maintained in order to retrieve * the best state, phone and word level segmentations of the acoustic input stream. * * Optional silence is allowed between words. * * To Do: * - allow for arbitrary left and right context begin and end phones. This would allow * these routines to do a detailed phonetic analysis in an error region for instance. * - allow optional filler word sequence between words (e.g. ++COUGH++ ++SNIFF++ SIL ++SNIFF++). * * Revision History * * 22-Nov-2004 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Modified to use senscr module for senone score evaluation. * * Revision 1.15 2004/11/13 00:38:43 egouvea * Replaced most printf with E_INFO (or E_WARN or...). Changed the output * of the time_align code so it's consistent with the other decoder modes * (allphone, normal decoding etc). Added the file utt id to the * time_align output. * * Revision 1.14 2004/07/16 00:57:11 egouvea * Added Ravi's implementation of FSG support. * * Revision 1.2 2004/05/27 14:22:57 rkm * FSG cross-word triphones completed (but for single-phone words) * * Revision 1.13 2001/12/11 00:24:48 lenzo * Acknowledgement in License. * * Revision 1.12 2001/12/07 17:30:02 lenzo * Clean up and remove extra lines. * * Revision 1.11 2001/12/07 05:09:30 lenzo * License.xsxc * * Revision 1.10 2001/12/07 04:27:35 lenzo * License cleanup. Remove conditions on the names. Rationale: These * conditions don't belong in the license itself, but in other fora that * offer protection for recognizeable names such as "Carnegie Mellon * University" and "Sphinx." These changes also reduce interoperability * issues with other licenses such as the Mozilla Public License and the * GPL. This update changes the top-level license files and removes the * old license conditions from each of the files that contained it. * All files in this collection fall under the copyright of the top-level * LICENSE file. * * Revision 1.9 2001/12/07 00:51:49 lenzo * Quiet warnings. * * Revision 1.8 2001/10/23 22:20:30 lenzo * Change error logging and reporting to the E_* macros that call common * functions. This will obsolete logmsg.[ch] and they will be removed * or changed in future versions. * * Revision 1.7 2001/07/02 16:47:12 lenzo * Fixed triphone lookup fallback case. * * Revision 1.6 2001/02/13 19:51:38 lenzo * *** empty log message *** * * Revision 1.5 2001/01/25 19:36:29 lenzo * Fixing some memory leaks * * Revision 1.4 2000/12/12 23:01:42 lenzo * Rationalizing libs and names some more. Split a/d and fe libs out. * * Revision 1.3 2000/12/05 01:45:12 lenzo * Restructuring, hear rationalization, warning removal, ANSIfy * * Revision 1.2 2000/03/29 14:30:28 awb * *** empty log message *** * * Revision 1.1.1.1 2000/01/28 22:08:57 lenzo * Initial import of sphinx2 * * * * 02-Jan-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Added time_align_word flag to determine whether word segmentation is * output. Implemented printing of word and phone segmentations. * * 02-Jan-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Added time_align_phone and time_align_state flags for determining at run * time whether phone-level and state-level backtrace are printed. * * Revision 1.4 1995/01/27 18:04:18 eht * Fixed a print statement bug * * Revision 1.3 1994/10/13 11:33:12 eht * Fixed handling of single phone words. * * Revision 1.2 1994/09/26 15:58:12 eht * Have time_align control allocation/deallocation of buffers. * Output revision control ID at initialization time * * Revision 1.1 1994/09/26 13:24:45 eht * Initial revision * * Public Interface: * * time_align_init() * Initializes the system. Should be called once before any of the following * calls are made. * * time_align_set_input() * Provide the speech input for time alignment. A call to this must precede * any of the following calls. * * time_align_word_sequence(char *left_word, char *word_seq, char *right_word) * Performs the forced recognition against the input. A call to this routine * must precede the following calls. * * char *time_align_best_word_string() * Returns the best word string associated with the best state sequence found * by time_align_word_sequence(). Included in this string are filler * words (sil, silb, sile, noise words). * * BEWARE: * The caller must NOT free the returned string. * * time_align_seg_output(unsigned short **seg, int *seg_cnt) * Returns the state time alignments in the form of a sequence of shorts * where: * high bit set indicates the first frame of a phone * * x = ci_phone_id * 5 + state_id, where state_id is either * 0, 1, 2, 3, 4 and ci_phone_id is the context independent * from the phone file given to the system. * * These data are used by the senone decision tree builder. * * BEWARE: * The caller must NOT free the returned seg array. * * SEGMENT_T *time_align_get_segmentation(int kind, int *seg_cnt) */#define SHOW_NOTHING (0x0000)#define SHOW_EVERYTHING (0xffff)#define SHOW_INVOKATION (0x0001) /* print function call invokation info */#define SHOW_MODEL_EVAL (0x0002) /* print model evaluation trace */#define SHOW_SUMMARY_INFO (0x0004) /* print search summary info */#define SHOW_ACTIVE (0x0008) /* print the active model state before/after evaluation scores */#define SHOW_BP (0x0010) /* show any new backpointer info for each frame */#define SHOW_FORCED_MODEL (0x0020) /* show the topology of the model to be used for forced recognition */#define SHOW_BEST_WORD_PATH (0x0040) /* print the best scoring word alignment */#define SHOW_PHONE_GRAPH (0x0080) /* print the phone graph to be searched */#define SHOW_PRUNING (0x0100) /* print the models pruned per frame */#define SHOW_MODEL_DAG (0x0200) /* print the model dag used */#define SHOW_NODE_EXPANSION (0x0400) /* print ci -> triphone expansion information */#define SHOW_SYS_INFO (0x0800) /* print memory usage/ system performance figures */#define SHOW_BEST_PHONE_PATH (0x1000) /* print the best scoring phone alignment */#define SHOW_BEST_STATE_PATH (0x2000) /* print the best scoring state alignment */#define SHOW_BEST_PATHS (SHOW_BEST_STATE_PATH|SHOW_BEST_WORD_PATH|SHOW_BEST_PHONE_PATH)#if !defined(SHOW)#define SHOW SHOW_BEST_PATHS#endif/* UNIX/C stuff */#include <assert.h>#include <stdarg.h>#include <stdio.h>#include <stdlib.h>#include <assert.h>#include <string.h>#include <ctype.h>#include <stdarg.h>/* CMU Speech stuff */#include "s2types.h"#include "ckd_alloc.h"#include "basic_types.h"#include "strfuncs.h"#include "list.h"#include "search_const.h"#include "msd.h"#include "dict.h"#include "lmclass.h"#include "lm_3g.h"#include "lm.h"#include "kb.h"#include "phone.h"#include "log.h"#include "s2_semi_mgau.h"#include "senscr.h"#include "s2params.h"#include "fbs.h"#include "senscr.h"#include "search.h"#include "err.h"#include "uttproc.h"#include "cmd_ln.h"/* This module stuff */#include "time_align.h"#ifndef TRUE#define TRUE 1#define FALSE 0#endifint save_labs(SEGMENT_T * segs, int num_entries, const char *dirname, const char *filename, const char *extname, const char *labtype);static mfcc_t ***feat_f;static int frame_cnt = 0;static int *active_models[2];static int *cur_active_models;static int *boundary_active_models;static int *pruned_active_models;static int cur_active_cnt;static int next_active_cnt;static int cur_frame = 0;#define WORD_BP_TABLE_SIZE_INCREMENT (1 * 1000)BACK_POINTER_T *word_bp_table = NULL;static int word_bp_table_next_free;static int word_bp_table_frame_start;static int max_word_bp_table_size = WORD_BP_TABLE_SIZE_INCREMENT;#define PHONE_BP_TABLE_SIZE_INCREMENT (10 * 1000)BACK_POINTER_T *phone_bp_table = NULL;static int phone_bp_table_next_free;static int phone_bp_table_frame_start;static int max_phone_bp_table_size = PHONE_BP_TABLE_SIZE_INCREMENT;#define STATE_BP_TABLE_SIZE_INCREMENT (NODE_CNT * 10 * 1000)BACK_POINTER_T *state_bp_table = NULL;static int state_bp_table_next_free;static int state_bp_table_frame_start;static int max_state_bp_table_size = STATE_BP_TABLE_SIZE_INCREMENT;static DYNMODEL_T *all_models = NULL;static int all_model_cnt;static int32 beam_width;static int32 n_word_segments = 0;static int32 n_phone_segments = 0;static int32 n_state_segments = 0;static char *best_word_string = NULL;static int best_word_string_len = 0;static int saved_final_model;static int32 sil_word_id;static int32 sil_phone_id;static int32 silb_phone_id;static int32 sile_phone_id;static int32 start_word_id;static int32 end_word_id;static const char *lcl_utt_id = NULL;static SEGMENT_T *wdseg = NULL;static SEGMENT_T *phseg = NULL;voidtime_align_set_utt(const char *id){ lcl_utt_id = id;}voidtime_align_set_beam_width(double bw){#if SHOW&SHOW_INVOKATION E_INFO("time_align_set_beam_width(%e) called\n", bw);#endif beam_width = 8 * LOG(bw);}intconstituent_cnt(char const *compound_word){ char *uscore; int cnt; char const *rem_word; rem_word = compound_word; cnt = 1; uscore = strchr(rem_word, '_'); while ((uscore = strchr(uscore + 1, '_'))) cnt++; ++cnt; return cnt;}char *head_word_of(int k){ dict_entry_t *cmpnd_wrd = word_dict->dict_list[k]; static char head_word[1024]; char *uscore; strcpy(head_word, cmpnd_wrd->word); uscore = strchr(head_word, '_'); *uscore = '\0'; return head_word;}char *cvt_uscores_to_sp(char const *word){ char *wrk = ckd_salloc(word); char *uscore; uscore = wrk; while ((uscore = strchr(uscore + 1, '_'))) { *uscore = ' '; } return wrk;}intdescending_order_by_len(const void *a, const void *b){ const COMPOUND_WORD_T *a_wrd = a; const COMPOUND_WORD_T *b_wrd = b; /* sort into descending order */ if (a_wrd->word_cnt < b_wrd->word_cnt) { return 1; /* implies swap */ } else if (a_wrd->word_cnt > b_wrd->word_cnt) { return -1; } else return 0;}COMPOUND_WORD_T *mk_compound_word_list(int *out_cnt){ int i, j; dict_entry_t **dict_list = word_dict->dict_list;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -