cont_ad_base.c

来自「CMU大名鼎鼎的SPHINX-3大词汇量连续语音识别系统」· C语言 代码 · 共 1,062 行 · 第 1/3 页

C
1,062
字号
/* ==================================================================== * Copyright (c) 1999-2001 Carnegie Mellon University.  All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright *    notice, this list of conditions and the following disclaimer.  * * 2. Redistributions in binary form must reproduce the above copyright *    notice, this list of conditions and the following disclaimer in *    the documentation and/or other materials provided with the *    distribution. * * This work was supported in part by funding from the Defense Advanced  * Research Projects Agency and the National Science Foundation of the  * United States of America, and the CMU Sphinx Speech Consortium. * * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * ==================================================================== * *//* * cont_ad.c -- Continuous A/D listening and silence filtering module. *  * HISTORY *  * 23-Oct-98	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * 		Small change in the way the noiselevel is updated in find_thresh(). *  * 26-Aug-98	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * 		Separated computation of "frame power" into a separate low-level * 		function. *  * 13-Jul-98	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * 		Modified to allow frame size to depend on audio sampling rate. *  * 01-Jul-98	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * 		Changed CONT_AD_DELTA_SPEECH back to 20. *  * 30-Jun-98	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * 		Changed CONT_AD_DELTA_SPEECH from 10 to 15. * 		Added FILE* argument to cont_ad_powhist_dump(). *  * 19-Jun-98	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * 		Changed CONT_AD_DELTA_SPEECH from 20 to 10, to increase sensitivity * 		to very short utterances. *  * 16-Jan-98	Paul Placeway (pwp@cs.cmu.edu) at Carnegie Mellon University * 		Changed to use dB instead of the weird power measure. * 		Changed analysis window size, tuned default settings of most * 		parameters to make the system less sensitive to noise, changed * 		the histogram update frequency and decay to make the system * 		adapt more rapidly to changes in the environment. * 		Added cont_ad_set_params() and cont_ad_get_params(). *  * 28-Jul-96	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * 		Added FRMPOW2SIGLVL, max_siglvl(), and cont_ad_t.siglvl. * 		Changed min signal energy/frame to CONT_AD_SPF. *  * 27-Jun-96	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * 		Added the option for cont_ad_read to return -1 on EOF. *  * 21-Jun-96	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * 		Added cont_ad_set_thresh(). * 		Bugfix: n_other is recomputed after updating thresholds. *  * 20-Jun-96	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * 		Separated thresholds for speech and silence. * 		Fixed bug in moving analysis window upon transition to speech state. *  * 17-Jun-96	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * 		Created, based loosely on Steve Reed's original implementation. *//* * This module is intended to be interposed as a filter between any raw A/D source and the * application to remove silence regions.  It is initialized with a raw A/D source function * (during the cont_ad_init call).  Filtered A/D data can be read by the application using * the cont_ad_read function.  This module assumes that the A/D source function supplies an * endless stream of data.  The application is responsible for setting up the A/D source, * turning recording on and off as it desires.  It is also responsible for invoking the * cont_ad_read function frequently enough to avoid buffer overruns and dropping A/D data. * This continuous listening module has an internal buffer of about 4 sec. *  * This module must be initialized and calibrated at first (cont_ad_init and cont_ad_calib * functions).  Raw samples are grouped into frames, the signal power in each frame is * computed and accumulated in a histogram.  The module is always in one of two states: * SILENCE or SPEECH.  Transitions between the two states are detected by looking for a * contiguous window of several frames that is predominantly of the other type.  The type * is determined by comparing frame power to either of two thresholds, thresh_sil and * thresh_speech, as appropriate for the current state.  These thresholds are set from the * first peak in the low-end of the power histogram, and are updated every few seconds. * Separate thresholds are used to provide some hysteresis. *  * The module maintains a linked list of speech (non-silence) segments not yet read by the * application.  The cont_ad_read function returns speech data, if any available, by * following this list.  It also updates an "absolute" timestamp at the end of the * cont_ad_read operation.  The timestamp indicates the total #samples of A/D data read * until this point, including data discarded as silence frames.  The application is * responsible for using this timestamp to make any policy decisions regarding utterance * boundaries or whatever. */#include <stdio.h>#include <stdlib.h>#include <string.h>#include <assert.h>#include <math.h>#include <s3types.h>#include "ad.h"#include "cont_ad.h"#include <err.h>#ifndef _ABS#define _ABS(x) ((x) >= 0 ? (x) : -(x))#endif/* States of continuous listening module */#define CONT_AD_STATE_SIL	0#define CONT_AD_STATE_SPEECH	1/* Various parameters, including defaults for many cont_ad_t member variables */#define CONT_AD_ADFRMSIZE	256	/* #Frames of internal A/D buffer maintained */#define CONT_AD_POWHISTSIZE	98	/* #Powhist bins: ~ FRMPOW(65536^2*CONT_AD_SPF) *//* Maximum level is 96.3 dB full-scale; 97 for safety, plus 1 for zero-based */#define CONT_AD_THRESH_UPDATE	100	/* Update thresholds approx every so many frames */	/* PWP: update was 200 frames, or 3.2 seconds.  Now about every 1.6 sec. */#define CONT_AD_ADAPT_RATE	0.2	/* Interpolation of new and old noiselevel */#define CONT_AD_SPS             16000#define CONT_AD_DEFAULT_NOISE	30	/* Default background noise power level */#define CONT_AD_DELTA_SIL	5	/* Initial default for cont_ad_t.delta_sil */#define CONT_AD_DELTA_SPEECH	20	/* Initial default for cont_ad_t.delta_speech */#define CONT_AD_MIN_NOISE	2	/* Expected minimum background noise level */#define CONT_AD_MAX_NOISE	70	/* Maximum background noise level */#define CONT_AD_WINSIZE		21	/* Analysis window for state transitions */				/* rkm had 16 */#define CONT_AD_SPEECH_ONSET	9	/* Min #speech frames in analysis window for					   SILENCE -> SPEECH state transition *//* * SReed had 100 ms == 6.25 fr contiguous; rkm had 9 (out of 16+10) with a * lower threshold. */#define CONT_AD_SIL_ONSET	18	/* Min #silence frames in analysis window for					   SPEECH -> SILENCE state transition					   MUST BE <= CONT_AD_WINSIZE *//* * SReed had 400 ms == 25 fr contiguous; rkm had 14 out of 16 */#define CONT_AD_LEADER		5	/* On transition to SPEECH state, so many frames					   BEFORE window included in speech data (>0) */				/* SReed had 200 ms == 12.5 fr; rkm had 5 */#define CONT_AD_TRAILER		10	/* On transition to SILENCE state, so many frames					   of silence included in speech data (>0).					   NOTE: Ensure (0 < TRAILER+LEADER <= WINSIZE) */				/* SReed had 100 ms == 6.25 fr; rkm had 10 */#ifdef CONT_AD_RAWDUMPstatic FILE *rawfp;#endifstatic FILE *logfp = NULL;	/* Detailed info written to fp if non-NULL */static int32 frmno = 0;void cont_ad_powhist_dump (FILE *fp, cont_ad_t *r){    int32 i;    for (i = 0; i < CONT_AD_POWHISTSIZE; i++)	if (r->pow_hist[i] > 0)	    fprintf (fp, "\t%3d %6d\n", i, r->pow_hist[i]);    fprintf (fp, "\tnew noiselevel= %d  thresh(sil,speech)= %d %d\n",	     r->noise_level, r->thresh_sil, r->thresh_speech);    fflush (fp);}/* * Compute frame power.  Interface deliberately kept low level to allow arbitrary * users to call this function with appropriate data. */int32 cont_ad_frame_pow (int16 *buf, int32 *prev, int32 spf){    double sumsq, v;    int32 i;    int32 p;        sumsq = 0.0;    p = *prev;    for (i = 0; i < spf; i++) {	v = (double) (buf[i] - p);	sumsq += v*v;	p = buf[i];    }    *prev = p;        if (sumsq < spf)	/* Make sure FRMPOW(sumsq) >= 0 */	sumsq = spf;    /*     * PWP: Units changed to dB     *     * Now the units of measurement of an input sample are volts (really!),     * so the power in dB is p = 20*log10(samp).  Further, we want the RMS     * (root-mean-squared) average power across the frame.     *     * "sumsq" is the sum of the sum of the squares, so we want     *     *   i = 20 * log10( sqrt ( sumsq / n_samps) )     *     * (Stephen Reed's code actually had      *    i = 20 * log10( sqrt (sumsq) / n_samps )     *  but this only produced an additive error.)     *     * i = 20 * log10( sqrt ( sumsq / n_samps) )     *   = 20 * log10( ( sumsq / n_samps) ^ 0.5 )     *   = 20 * log10( ( sumsq / n_samps) ) * 0.5 )     *   = 10 * log10( ( sumsq / n_samps) )     *   = 10 * ( log10( sumsq) - log10(n_samps) )     */    i = (int32) ((10.0 * (log10(sumsq) - log10((double) spf))) + 0.5);    if (i < 0) i = 0;		/* trim lower bound again to be safe. */    assert (i < 97);    return (i);}/* * Classify frame (id=frm, starting at sample position s) as sil/nonsil.  Classification * done in isolation, independent of any other frame, based only on power histogram. */static void compute_frame_pow (cont_ad_t *r, int32 frm){    int32 i;        i = cont_ad_frame_pow (r->adbuf + (frm * r->spf), &(r->prev_sample), r->spf);    if (logfp) {        fprintf (logfp, "%8.2f %2d\n",		 (double)(frmno * r->spf)/(double)(r->sps), i);	fflush (logfp);	frmno++;    }    r->frm_pow[frm] = (char) i;    (r->pow_hist[i])++;    r->thresh_update--;}void cont_ad_set_logfp (FILE *fp){    logfp = fp;}/* PWP: $$$ check this *//* * PWP: in SReed's code, decay was done by zeroing the histogram, * i.e. no history. */static void decay_hist (cont_ad_t *r){    int32 i;        for (i = 0; i < CONT_AD_POWHISTSIZE; i++)	r->pow_hist[i] >>= 1;}/* * Find silence threshold from power histogram. */static int32 find_thresh (cont_ad_t *r){    int32 i, j, max, th;    if (!r->auto_thresh)      return 0;    /*     * Find smallest non-zero histogram entry, but starting at some minimum power.     * Power lower than CONT_AD_MIN_NOISE indicates bad A/D input (eg, mic off...).     * Too high a minimum power is also bad.     */    for (i = r->min_noise; (i < CONT_AD_POWHISTSIZE) && (r->pow_hist[i] == 0); i++);    if (i > r->max_noise)	/* Bad signal? */	return -1;    /* PWP: Hmmmmm.... SReed's code looks over the lower 20 dB */    /* PWP: 1/14/98  Made to work like Stephen Reed's code */    max = 0;    for (j = i, th = i;	 (j < CONT_AD_POWHISTSIZE) && (j < i+20); j++) { /* PWP: was i+6, which was 9 dB */	if (max < r->pow_hist[j]) {	    max = r->pow_hist[j];	    th = j;	}    }    if (logfp) {	fprintf (logfp, "\tfrm= %d  noiselevel= %d  histthresh= %d\n",		 frmno, r->noise_level, th);    }        /* "Don't change the threshold too fast" */#if 0    if ( _ABS(r->noise_level - th) >= 10 ) {	if (th > r->noise_level)	    r->noise_level += ((th - r->noise_level) / 2);	else	    r->noise_level -= ((r->noise_level - th) / 2);    } else {	r->noise_level = th;    }#else    /*     * RKM: The above is odd; if (diff >= 10) += diff/2; else += diff??     * This is discontinuous.  Change to always += diff/2.     */    r->noise_level = (int32) (th * r->adapt_rate + r->noise_level * (1.0 - r->adapt_rate));#endif    /* update thresholds */    r->thresh_sil = r->noise_level + r->delta_sil;    r->thresh_speech = r->noise_level + r->delta_speech;    if (logfp)	cont_ad_powhist_dump (logfp, r);        /*

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?