main_align.c

来自「CMU大名鼎鼎的SPHINX-3大词汇量连续语音识别系统」· C语言 代码 · 共 1,189 行 · 第 1/3 页

C
1,189
字号
/* ==================================================================== * Copyright (c) 1995-2004 Carnegie Mellon University.  All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright *    notice, this list of conditions and the following disclaimer.  * * 2. Redistributions in binary form must reproduce the above copyright *    notice, this list of conditions and the following disclaimer in *    the documentation and/or other materials provided with the *    distribution. * * This work was supported in part by funding from the Defense Advanced  * Research Projects Agency and the National Science Foundation of the  * United States of America, and the CMU Sphinx Speech Consortium. * * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * ==================================================================== * *//* * align-main.c -- Main driver routine for time alignment. *  * ********************************************** * CMU ARPA Speech Project * * Copyright (c) 1996 Carnegie Mellon University. * ALL RIGHTS RESERVED. * ********************************************** *  * HISTORY *  * 19-Jun-1998	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * 		Modified to handle the new libfeat interface. *  * 11-Oct-1997	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University. * 		Added MLLR transformation for each utterance. *  * 06-Mar-1997	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University. * 		Added .semi. and .cont. options to -senmgaufn flag. *   * 16-Oct-1996	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * 		Added orig_stdout, orig_stderr hack to avoid hanging on exit under Linux. *   * 14-Oct-1996	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * 		Removed explicit addition of SILENCE_WORD, START_WORD and * 		FINISH_WORD to the dictionary. *  * 18-Sep-1996	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * 		Added optional start/end frame specification in control file, for * 		processing selected segments (utterances) from a large cepfile. * 		Control spec: cepfile [startframe endframe [uttid]]. * 		(There are incompatibilities with ,CTL output directory specification.) *  * 13-Sep-1996	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * 		Normalized senone scores (subtracting the best) rather than density scores. * 		Bugfix: Absolute scores written to state score output file by removing * 		normalization factor. *  * 22-Jul-1996	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * 		Added absolute (unnormalized) acoustic scores in log file. * 		Added Sphinx-II compatible output segmentation files. *  * 15-Jul-1996	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * 		Created. */#include <stdio.h>#include <stdlib.h>#include <string.h>#include <assert.h>#include <s3types.h>#include "kb.h"#include "logs3.h"#include "tmat.h"#include "mdef.h"#include "dict.h"#include "agc.h"#include "cmn.h"#include "bio.h"#include "feat.h"/* ARCHAN: Dangerous routine :-)*/#include "s3_align.h"#include "ms_mllr.h"#include "ms_gauden.h"#include "ms_senone.h"#include "interp.h"/** \file main_align.c   \brief Main driver routine for time alignment.*/static arg_t defn[] = {    { "-logbase",      ARG_FLOAT32,      "1.0003",      "Base in which all log values calculated" },    { "-mdef",       ARG_STRING,      NULL,      "Model definition input file: triphone -> senones/tmat tying" },    { "-tmat",      ARG_STRING,      NULL,      "Transition matrix input file" },    { "-mean",      ARG_STRING,      NULL,      "Mixture gaussian codebooks mean parameters input file" },    { "-var",      ARG_STRING,      NULL,      "Mixture gaussian codebooks variance parameters input file" },    { "-senmgau",      ARG_STRING,      ".cont.",      "Senone to mixture-gaussian mapping file (or .semi. or .cont.)" },    { "-mixw",      ARG_STRING,      NULL,      "Senone mixture weights parameters input file" },    { "-tpfloor",      ARG_FLOAT32,      "0.0001",      "Triphone state transition probability floor applied to -tmat file" },    { "-varfloor",      ARG_FLOAT32,      "0.0001",      "Codebook variance floor applied to -var file" },    { "-mwfloor",      ARG_FLOAT32,      "0.0000001",      "Codebook mixture weight floor applied to -mixw file" },    { "-agc",      ARG_STRING,      "max",      "AGC.  max: C0 -= max(C0) in current utt; none: no AGC" },    { "-log3table",      ARG_INT32,      "1",      "Determines whether to use the log3 table or to compute the values at run time."},    { "-cmn",      ARG_STRING,      "current",      "Cepstral mean norm.  current: C[1..n-1] -= mean(C[1..n-1]) in current utt; none: no CMN" },    { "-varnorm",      ARG_STRING,      "no",      "Variance normalize each utterance (yes/no; only applicable if CMN is also performed)" },    { "-feat",	/* Captures the computation for converting input to feature vector */      ARG_STRING,      "1s_c_d_dd",      "Feature stream: s2_4x / s3_1x39 / cep_dcep[,%d] / cep[,%d] / %d,%d,...,%d" },    { "-dict",      ARG_STRING,      NULL,      "Main pronunciation dictionary (lexicon) input file" },    { "-fdict",      ARG_STRING,      NULL,      "Optional filler word (noise word) pronunciation dictionary input file" },    { "-compwd",      ARG_INT32,      "0",      "Compound words in dictionary (not supported yet)" },    { "-ctl",      ARG_STRING,      NULL,      "Input control file listing utterances to be decoded" },    { "-ctloffset",      ARG_INT32,      "0",      "No. of utterances at the beginning of -ctl file to be skipped" },    { "-ctlcount",      ARG_INT32,      0,      "No. of utterances in -ctl file to be processed (after -ctloffset).  Default: Until EOF" },    { "-cepdir",      ARG_STRING,      ".",      "Directory for utterances in -ctl file (if relative paths specified)." },    { "-cepext",      ARG_STRING,      ".mfc",      "File extension appended to utterances listed in ctl file" },    { "-mllrctl",      ARG_STRING,      NULL,      "Input control file listing MLLR input data; parallel to ctl argument file" },    { "-lambda",      ARG_STRING,      NULL,      "Interpolation weights (CD/CI senone) parameters input file" },    { "-topn",      ARG_INT32,      "4",      "No. of top scoring densities computed in each mixture gaussian codebook" },    { "-beam",      ARG_FLOAT64,      "1e-64",      "Main pruning beam applied to triphones in forward search" },    { "-insent",      ARG_STRING,      NULL,      "Input transcript file corresponding to control file" },    { "-outsent",      ARG_STRING,      NULL,      "Output transcript file with exact pronunciation/transcription" },    { "-stsegdir",      ARG_STRING,      NULL,      "Output directory for state segmentation files; optionally end with ,CTL" },    { "-phsegdir",      ARG_STRING,      NULL,      "Output directory for phone segmentation files; optionally end with ,CTL" },    { "-wdsegdir",      ARG_STRING,      NULL,      "Output directory for word segmentation files; optionally end with ,CTL" },    { "-s2stsegdir",      ARG_STRING,      NULL,      "Output directory for Sphinx-II format state segmentation files; optionally end with ,CTL" },    { "-logfn",      ARG_STRING,      NULL,      "Log file (default stdout/stderr)" },        { NULL, ARG_INT32, NULL, NULL }};static mdef_t *mdef;		/* Model definition */static gauden_t *g;		/* Gaussian density codebooks */static senone_t *sen;		/* Senones */static interp_t *interp;	/* CD/CI interpolation */static tmat_t *tmat;		/* HMM transition matrices */static feat_t *fcb;		/* Feature type descriptor (Feature Control Block) */static float32 ***feat = NULL;	/* Speech feature data */static s3wid_t startwid, finishwid, silwid;static int32 *senscale;		/* ALL senone scores scaled by senscale[i] in frame i */static FILE *outsentfp = NULL;/* For profiling/timing */enum { tmr_utt, tmr_gauden, tmr_senone, tmr_align };ptmr_t timers[5];static int32 tot_nfr;static ptmr_t tm_utt;/* * Load and cross-check all models (acoustic/lexical/linguistic). */static dict_t *dict;static void models_init ( void ){    float32 varfloor, mixwfloor, tpfloor;    int32 i;    char *arg;        /* HMM model definition */    mdef = mdef_init ((char *) cmd_ln_access("-mdef"));    /* Dictionary */    dict = dict_init (mdef,		      (char *) cmd_ln_access("-dict"),		      (char *) cmd_ln_access("-fdict"),		      '_');	/* Compound word separator.  Default: none. */    /* HACK!! Make sure SILENCE_WORD, START_WORD and FINISH_WORD are in dictionary */    silwid = dict_wordid (dict, S3_SILENCE_WORD);    startwid = dict_wordid (dict, S3_START_WORD);    finishwid = dict_wordid (dict, S3_FINISH_WORD);    if (NOT_S3WID(silwid) || NOT_S3WID(startwid) || NOT_S3WID(finishwid)) {	E_FATAL("%s, %s, or %s missing from dictionary\n",		S3_SILENCE_WORD, S3_START_WORD, S3_FINISH_WORD);    }    if ((dict->filler_start > dict->filler_end) || (! dict_filler_word (dict, silwid)))	E_FATAL("%s must occur (only) in filler dictionary\n", S3_SILENCE_WORD);    /* No check that alternative pronunciations for filler words are in filler range!! */    /* Codebooks */    varfloor = *((float32 *) cmd_ln_access("-varfloor"));    g = gauden_init ((char *) cmd_ln_access("-mean"),		     (char *) cmd_ln_access("-var"),		     varfloor);    /* Verify codebook feature dimensions against libfeat */    if (feat_n_stream(fcb) != g->n_feat) {	E_FATAL("#feature mismatch: feat= %d, mean/var= %d\n",		feat_n_stream(fcb), g->n_feat);    }    for (i = 0; i < feat_n_stream(fcb); i++) {	if (feat_stream_len(fcb,i) != g->featlen[i]) {	    E_FATAL("featlen[%d] mismatch: feat= %d, mean/var= %d\n", i,		    feat_stream_len(fcb, i), g->featlen[i]);	}    }        /* Senone mixture weights */    mixwfloor = *((float32 *) cmd_ln_access("-mwfloor"));    sen = senone_init ((char *) cmd_ln_access("-mixw"),		       (char *) cmd_ln_access("-senmgau"),		       mixwfloor);        /* Verify senone parameters against gauden parameters */    if (sen->n_feat != g->n_feat)	E_FATAL("#Feature mismatch: gauden= %d, senone= %d\n", g->n_feat, sen->n_feat);    if (sen->n_cw != g->n_density)	E_FATAL("#Densities mismatch: gauden= %d, senone= %d\n", g->n_density, sen->n_cw);    if (sen->n_gauden > g->n_mgau)	E_FATAL("Senones need more codebooks (%d) than present (%d)\n",		sen->n_gauden, g->n_mgau);    if (sen->n_gauden < g->n_mgau)	E_ERROR("Senones use fewer codebooks (%d) than present (%d)\n",		sen->n_gauden, g->n_mgau);    /* Verify senone parameters against model definition parameters */    if (mdef->n_sen != sen->n_sen)	E_FATAL("Model definition has %d senones; but #senone= %d\n",		mdef->n_sen, sen->n_sen);    /* CD/CI senone interpolation weights file, if present */    if ((arg = (char *) cmd_ln_access ("-lambda")) != NULL) {	interp = interp_init (arg);	/* Verify interpolation weights size with senones */	if (interp->n_sen != sen->n_sen)	    E_FATAL("Interpolation file has %d weights; but #senone= %d\n",		    interp->n_sen, sen->n_sen);    } else	interp = NULL;    /* Transition matrices */    tpfloor = *((float32 *) cmd_ln_access("-tpfloor"));    tmat = tmat_init ((char *) cmd_ln_access("-tmat"), tpfloor);    /* Verify transition matrices parameters against model definition parameters */    if (mdef->n_tmat != tmat->n_tmat)	E_FATAL("Model definition has %d tmat; but #tmat= %d\n",		mdef->n_tmat, tmat->n_tmat);    if (mdef->n_emit_state != tmat->n_state)	E_FATAL("#Emitting states in model definition = %d, #states in tmat = %d\n",		mdef->n_emit_state, tmat->n_state);    arg = (char *) cmd_ln_access ("-agc");    if ((strcmp (arg, "max") != 0) && (strcmp (arg, "none") != 0))	E_FATAL("Unknown -agc argument: %s\n", arg);    arg = (char *) cmd_ln_access ("-cmn");    if ((strcmp (arg, "current") != 0) && (strcmp (arg, "none") != 0))	E_FATAL("Unknown -cmn argument: %s\n", arg);}/* * Build a filename int buf as follows (without file extension): *     if dir ends with ,CTL and ctlspec does not begin with /, filename is dir/ctlspec *     if dir ends with ,CTL and ctlspec DOES begin with /, filename is ctlspec *     if dir does not end with ,CTL, filename is dir/uttid, * where ctlspec is the complete utterance spec in the input control file, and * uttid is the last component of ctlspec. */static void build_output_uttfile (char *buf, char *dir, char *uttid, char *ctlspec){    int32 k;        k = strlen(dir);    if ((k > 4) && (strcmp (dir+k-4, ",CTL") == 0)) {	/* HACK!! Hardwired ,CTL */	if (ctlspec[0] != '/') {

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?