main_align.c
来自「CMU大名鼎鼎的SPHINX-3大词汇量连续语音识别系统」· C语言 代码 · 共 1,189 行 · 第 1/3 页
C
1,189 行
/* ==================================================================== * Copyright (c) 1995-2004 Carnegie Mellon University. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * This work was supported in part by funding from the Defense Advanced * Research Projects Agency and the National Science Foundation of the * United States of America, and the CMU Sphinx Speech Consortium. * * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * ==================================================================== * *//* * align-main.c -- Main driver routine for time alignment. * * ********************************************** * CMU ARPA Speech Project * * Copyright (c) 1996 Carnegie Mellon University. * ALL RIGHTS RESERVED. * ********************************************** * * HISTORY * * 19-Jun-1998 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Modified to handle the new libfeat interface. * * 11-Oct-1997 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University. * Added MLLR transformation for each utterance. * * 06-Mar-1997 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University. * Added .semi. and .cont. options to -senmgaufn flag. * * 16-Oct-1996 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Added orig_stdout, orig_stderr hack to avoid hanging on exit under Linux. * * 14-Oct-1996 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Removed explicit addition of SILENCE_WORD, START_WORD and * FINISH_WORD to the dictionary. * * 18-Sep-1996 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Added optional start/end frame specification in control file, for * processing selected segments (utterances) from a large cepfile. * Control spec: cepfile [startframe endframe [uttid]]. * (There are incompatibilities with ,CTL output directory specification.) * * 13-Sep-1996 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Normalized senone scores (subtracting the best) rather than density scores. * Bugfix: Absolute scores written to state score output file by removing * normalization factor. * * 22-Jul-1996 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Added absolute (unnormalized) acoustic scores in log file. * Added Sphinx-II compatible output segmentation files. * * 15-Jul-1996 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Created. */#include <stdio.h>#include <stdlib.h>#include <string.h>#include <assert.h>#include <s3types.h>#include "kb.h"#include "logs3.h"#include "tmat.h"#include "mdef.h"#include "dict.h"#include "agc.h"#include "cmn.h"#include "bio.h"#include "feat.h"/* ARCHAN: Dangerous routine :-)*/#include "s3_align.h"#include "ms_mllr.h"#include "ms_gauden.h"#include "ms_senone.h"#include "interp.h"/** \file main_align.c \brief Main driver routine for time alignment.*/static arg_t defn[] = { { "-logbase", ARG_FLOAT32, "1.0003", "Base in which all log values calculated" }, { "-mdef", ARG_STRING, NULL, "Model definition input file: triphone -> senones/tmat tying" }, { "-tmat", ARG_STRING, NULL, "Transition matrix input file" }, { "-mean", ARG_STRING, NULL, "Mixture gaussian codebooks mean parameters input file" }, { "-var", ARG_STRING, NULL, "Mixture gaussian codebooks variance parameters input file" }, { "-senmgau", ARG_STRING, ".cont.", "Senone to mixture-gaussian mapping file (or .semi. or .cont.)" }, { "-mixw", ARG_STRING, NULL, "Senone mixture weights parameters input file" }, { "-tpfloor", ARG_FLOAT32, "0.0001", "Triphone state transition probability floor applied to -tmat file" }, { "-varfloor", ARG_FLOAT32, "0.0001", "Codebook variance floor applied to -var file" }, { "-mwfloor", ARG_FLOAT32, "0.0000001", "Codebook mixture weight floor applied to -mixw file" }, { "-agc", ARG_STRING, "max", "AGC. max: C0 -= max(C0) in current utt; none: no AGC" }, { "-log3table", ARG_INT32, "1", "Determines whether to use the log3 table or to compute the values at run time."}, { "-cmn", ARG_STRING, "current", "Cepstral mean norm. current: C[1..n-1] -= mean(C[1..n-1]) in current utt; none: no CMN" }, { "-varnorm", ARG_STRING, "no", "Variance normalize each utterance (yes/no; only applicable if CMN is also performed)" }, { "-feat", /* Captures the computation for converting input to feature vector */ ARG_STRING, "1s_c_d_dd", "Feature stream: s2_4x / s3_1x39 / cep_dcep[,%d] / cep[,%d] / %d,%d,...,%d" }, { "-dict", ARG_STRING, NULL, "Main pronunciation dictionary (lexicon) input file" }, { "-fdict", ARG_STRING, NULL, "Optional filler word (noise word) pronunciation dictionary input file" }, { "-compwd", ARG_INT32, "0", "Compound words in dictionary (not supported yet)" }, { "-ctl", ARG_STRING, NULL, "Input control file listing utterances to be decoded" }, { "-ctloffset", ARG_INT32, "0", "No. of utterances at the beginning of -ctl file to be skipped" }, { "-ctlcount", ARG_INT32, 0, "No. of utterances in -ctl file to be processed (after -ctloffset). Default: Until EOF" }, { "-cepdir", ARG_STRING, ".", "Directory for utterances in -ctl file (if relative paths specified)." }, { "-cepext", ARG_STRING, ".mfc", "File extension appended to utterances listed in ctl file" }, { "-mllrctl", ARG_STRING, NULL, "Input control file listing MLLR input data; parallel to ctl argument file" }, { "-lambda", ARG_STRING, NULL, "Interpolation weights (CD/CI senone) parameters input file" }, { "-topn", ARG_INT32, "4", "No. of top scoring densities computed in each mixture gaussian codebook" }, { "-beam", ARG_FLOAT64, "1e-64", "Main pruning beam applied to triphones in forward search" }, { "-insent", ARG_STRING, NULL, "Input transcript file corresponding to control file" }, { "-outsent", ARG_STRING, NULL, "Output transcript file with exact pronunciation/transcription" }, { "-stsegdir", ARG_STRING, NULL, "Output directory for state segmentation files; optionally end with ,CTL" }, { "-phsegdir", ARG_STRING, NULL, "Output directory for phone segmentation files; optionally end with ,CTL" }, { "-wdsegdir", ARG_STRING, NULL, "Output directory for word segmentation files; optionally end with ,CTL" }, { "-s2stsegdir", ARG_STRING, NULL, "Output directory for Sphinx-II format state segmentation files; optionally end with ,CTL" }, { "-logfn", ARG_STRING, NULL, "Log file (default stdout/stderr)" }, { NULL, ARG_INT32, NULL, NULL }};static mdef_t *mdef; /* Model definition */static gauden_t *g; /* Gaussian density codebooks */static senone_t *sen; /* Senones */static interp_t *interp; /* CD/CI interpolation */static tmat_t *tmat; /* HMM transition matrices */static feat_t *fcb; /* Feature type descriptor (Feature Control Block) */static float32 ***feat = NULL; /* Speech feature data */static s3wid_t startwid, finishwid, silwid;static int32 *senscale; /* ALL senone scores scaled by senscale[i] in frame i */static FILE *outsentfp = NULL;/* For profiling/timing */enum { tmr_utt, tmr_gauden, tmr_senone, tmr_align };ptmr_t timers[5];static int32 tot_nfr;static ptmr_t tm_utt;/* * Load and cross-check all models (acoustic/lexical/linguistic). */static dict_t *dict;static void models_init ( void ){ float32 varfloor, mixwfloor, tpfloor; int32 i; char *arg; /* HMM model definition */ mdef = mdef_init ((char *) cmd_ln_access("-mdef")); /* Dictionary */ dict = dict_init (mdef, (char *) cmd_ln_access("-dict"), (char *) cmd_ln_access("-fdict"), '_'); /* Compound word separator. Default: none. */ /* HACK!! Make sure SILENCE_WORD, START_WORD and FINISH_WORD are in dictionary */ silwid = dict_wordid (dict, S3_SILENCE_WORD); startwid = dict_wordid (dict, S3_START_WORD); finishwid = dict_wordid (dict, S3_FINISH_WORD); if (NOT_S3WID(silwid) || NOT_S3WID(startwid) || NOT_S3WID(finishwid)) { E_FATAL("%s, %s, or %s missing from dictionary\n", S3_SILENCE_WORD, S3_START_WORD, S3_FINISH_WORD); } if ((dict->filler_start > dict->filler_end) || (! dict_filler_word (dict, silwid))) E_FATAL("%s must occur (only) in filler dictionary\n", S3_SILENCE_WORD); /* No check that alternative pronunciations for filler words are in filler range!! */ /* Codebooks */ varfloor = *((float32 *) cmd_ln_access("-varfloor")); g = gauden_init ((char *) cmd_ln_access("-mean"), (char *) cmd_ln_access("-var"), varfloor); /* Verify codebook feature dimensions against libfeat */ if (feat_n_stream(fcb) != g->n_feat) { E_FATAL("#feature mismatch: feat= %d, mean/var= %d\n", feat_n_stream(fcb), g->n_feat); } for (i = 0; i < feat_n_stream(fcb); i++) { if (feat_stream_len(fcb,i) != g->featlen[i]) { E_FATAL("featlen[%d] mismatch: feat= %d, mean/var= %d\n", i, feat_stream_len(fcb, i), g->featlen[i]); } } /* Senone mixture weights */ mixwfloor = *((float32 *) cmd_ln_access("-mwfloor")); sen = senone_init ((char *) cmd_ln_access("-mixw"), (char *) cmd_ln_access("-senmgau"), mixwfloor); /* Verify senone parameters against gauden parameters */ if (sen->n_feat != g->n_feat) E_FATAL("#Feature mismatch: gauden= %d, senone= %d\n", g->n_feat, sen->n_feat); if (sen->n_cw != g->n_density) E_FATAL("#Densities mismatch: gauden= %d, senone= %d\n", g->n_density, sen->n_cw); if (sen->n_gauden > g->n_mgau) E_FATAL("Senones need more codebooks (%d) than present (%d)\n", sen->n_gauden, g->n_mgau); if (sen->n_gauden < g->n_mgau) E_ERROR("Senones use fewer codebooks (%d) than present (%d)\n", sen->n_gauden, g->n_mgau); /* Verify senone parameters against model definition parameters */ if (mdef->n_sen != sen->n_sen) E_FATAL("Model definition has %d senones; but #senone= %d\n", mdef->n_sen, sen->n_sen); /* CD/CI senone interpolation weights file, if present */ if ((arg = (char *) cmd_ln_access ("-lambda")) != NULL) { interp = interp_init (arg); /* Verify interpolation weights size with senones */ if (interp->n_sen != sen->n_sen) E_FATAL("Interpolation file has %d weights; but #senone= %d\n", interp->n_sen, sen->n_sen); } else interp = NULL; /* Transition matrices */ tpfloor = *((float32 *) cmd_ln_access("-tpfloor")); tmat = tmat_init ((char *) cmd_ln_access("-tmat"), tpfloor); /* Verify transition matrices parameters against model definition parameters */ if (mdef->n_tmat != tmat->n_tmat) E_FATAL("Model definition has %d tmat; but #tmat= %d\n", mdef->n_tmat, tmat->n_tmat); if (mdef->n_emit_state != tmat->n_state) E_FATAL("#Emitting states in model definition = %d, #states in tmat = %d\n", mdef->n_emit_state, tmat->n_state); arg = (char *) cmd_ln_access ("-agc"); if ((strcmp (arg, "max") != 0) && (strcmp (arg, "none") != 0)) E_FATAL("Unknown -agc argument: %s\n", arg); arg = (char *) cmd_ln_access ("-cmn"); if ((strcmp (arg, "current") != 0) && (strcmp (arg, "none") != 0)) E_FATAL("Unknown -cmn argument: %s\n", arg);}/* * Build a filename int buf as follows (without file extension): * if dir ends with ,CTL and ctlspec does not begin with /, filename is dir/ctlspec * if dir ends with ,CTL and ctlspec DOES begin with /, filename is ctlspec * if dir does not end with ,CTL, filename is dir/uttid, * where ctlspec is the complete utterance spec in the input control file, and * uttid is the last component of ctlspec. */static void build_output_uttfile (char *buf, char *dir, char *uttid, char *ctlspec){ int32 k; k = strlen(dir); if ((k > 4) && (strcmp (dir+k-4, ",CTL") == 0)) { /* HACK!! Hardwired ,CTL */ if (ctlspec[0] != '/') {
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?