cont_ad_base.c
来自「CMU大名鼎鼎的SPHINX-3大词汇量连续语音识别系统」· C语言 代码 · 共 1,062 行 · 第 1/3 页
C
1,062 行
/* ==================================================================== * Copyright (c) 1999-2001 Carnegie Mellon University. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * This work was supported in part by funding from the Defense Advanced * Research Projects Agency and the National Science Foundation of the * United States of America, and the CMU Sphinx Speech Consortium. * * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * ==================================================================== * *//* * cont_ad.c -- Continuous A/D listening and silence filtering module. * * HISTORY * * 23-Oct-98 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Small change in the way the noiselevel is updated in find_thresh(). * * 26-Aug-98 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Separated computation of "frame power" into a separate low-level * function. * * 13-Jul-98 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Modified to allow frame size to depend on audio sampling rate. * * 01-Jul-98 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Changed CONT_AD_DELTA_SPEECH back to 20. * * 30-Jun-98 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Changed CONT_AD_DELTA_SPEECH from 10 to 15. * Added FILE* argument to cont_ad_powhist_dump(). * * 19-Jun-98 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Changed CONT_AD_DELTA_SPEECH from 20 to 10, to increase sensitivity * to very short utterances. * * 16-Jan-98 Paul Placeway (pwp@cs.cmu.edu) at Carnegie Mellon University * Changed to use dB instead of the weird power measure. * Changed analysis window size, tuned default settings of most * parameters to make the system less sensitive to noise, changed * the histogram update frequency and decay to make the system * adapt more rapidly to changes in the environment. * Added cont_ad_set_params() and cont_ad_get_params(). * * 28-Jul-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Added FRMPOW2SIGLVL, max_siglvl(), and cont_ad_t.siglvl. * Changed min signal energy/frame to CONT_AD_SPF. * * 27-Jun-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Added the option for cont_ad_read to return -1 on EOF. * * 21-Jun-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Added cont_ad_set_thresh(). * Bugfix: n_other is recomputed after updating thresholds. * * 20-Jun-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Separated thresholds for speech and silence. * Fixed bug in moving analysis window upon transition to speech state. * * 17-Jun-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Created, based loosely on Steve Reed's original implementation. *//* * This module is intended to be interposed as a filter between any raw A/D source and the * application to remove silence regions. It is initialized with a raw A/D source function * (during the cont_ad_init call). Filtered A/D data can be read by the application using * the cont_ad_read function. This module assumes that the A/D source function supplies an * endless stream of data. The application is responsible for setting up the A/D source, * turning recording on and off as it desires. It is also responsible for invoking the * cont_ad_read function frequently enough to avoid buffer overruns and dropping A/D data. * This continuous listening module has an internal buffer of about 4 sec. * * This module must be initialized and calibrated at first (cont_ad_init and cont_ad_calib * functions). Raw samples are grouped into frames, the signal power in each frame is * computed and accumulated in a histogram. The module is always in one of two states: * SILENCE or SPEECH. Transitions between the two states are detected by looking for a * contiguous window of several frames that is predominantly of the other type. The type * is determined by comparing frame power to either of two thresholds, thresh_sil and * thresh_speech, as appropriate for the current state. These thresholds are set from the * first peak in the low-end of the power histogram, and are updated every few seconds. * Separate thresholds are used to provide some hysteresis. * * The module maintains a linked list of speech (non-silence) segments not yet read by the * application. The cont_ad_read function returns speech data, if any available, by * following this list. It also updates an "absolute" timestamp at the end of the * cont_ad_read operation. The timestamp indicates the total #samples of A/D data read * until this point, including data discarded as silence frames. The application is * responsible for using this timestamp to make any policy decisions regarding utterance * boundaries or whatever. */#include <stdio.h>#include <stdlib.h>#include <string.h>#include <assert.h>#include <math.h>#include <s3types.h>#include "ad.h"#include "cont_ad.h"#include <err.h>#ifndef _ABS#define _ABS(x) ((x) >= 0 ? (x) : -(x))#endif/* States of continuous listening module */#define CONT_AD_STATE_SIL 0#define CONT_AD_STATE_SPEECH 1/* Various parameters, including defaults for many cont_ad_t member variables */#define CONT_AD_ADFRMSIZE 256 /* #Frames of internal A/D buffer maintained */#define CONT_AD_POWHISTSIZE 98 /* #Powhist bins: ~ FRMPOW(65536^2*CONT_AD_SPF) *//* Maximum level is 96.3 dB full-scale; 97 for safety, plus 1 for zero-based */#define CONT_AD_THRESH_UPDATE 100 /* Update thresholds approx every so many frames */ /* PWP: update was 200 frames, or 3.2 seconds. Now about every 1.6 sec. */#define CONT_AD_ADAPT_RATE 0.2 /* Interpolation of new and old noiselevel */#define CONT_AD_SPS 16000#define CONT_AD_DEFAULT_NOISE 30 /* Default background noise power level */#define CONT_AD_DELTA_SIL 5 /* Initial default for cont_ad_t.delta_sil */#define CONT_AD_DELTA_SPEECH 20 /* Initial default for cont_ad_t.delta_speech */#define CONT_AD_MIN_NOISE 2 /* Expected minimum background noise level */#define CONT_AD_MAX_NOISE 70 /* Maximum background noise level */#define CONT_AD_WINSIZE 21 /* Analysis window for state transitions */ /* rkm had 16 */#define CONT_AD_SPEECH_ONSET 9 /* Min #speech frames in analysis window for SILENCE -> SPEECH state transition *//* * SReed had 100 ms == 6.25 fr contiguous; rkm had 9 (out of 16+10) with a * lower threshold. */#define CONT_AD_SIL_ONSET 18 /* Min #silence frames in analysis window for SPEECH -> SILENCE state transition MUST BE <= CONT_AD_WINSIZE *//* * SReed had 400 ms == 25 fr contiguous; rkm had 14 out of 16 */#define CONT_AD_LEADER 5 /* On transition to SPEECH state, so many frames BEFORE window included in speech data (>0) */ /* SReed had 200 ms == 12.5 fr; rkm had 5 */#define CONT_AD_TRAILER 10 /* On transition to SILENCE state, so many frames of silence included in speech data (>0). NOTE: Ensure (0 < TRAILER+LEADER <= WINSIZE) */ /* SReed had 100 ms == 6.25 fr; rkm had 10 */#ifdef CONT_AD_RAWDUMPstatic FILE *rawfp;#endifstatic FILE *logfp = NULL; /* Detailed info written to fp if non-NULL */static int32 frmno = 0;void cont_ad_powhist_dump (FILE *fp, cont_ad_t *r){ int32 i; for (i = 0; i < CONT_AD_POWHISTSIZE; i++) if (r->pow_hist[i] > 0) fprintf (fp, "\t%3d %6d\n", i, r->pow_hist[i]); fprintf (fp, "\tnew noiselevel= %d thresh(sil,speech)= %d %d\n", r->noise_level, r->thresh_sil, r->thresh_speech); fflush (fp);}/* * Compute frame power. Interface deliberately kept low level to allow arbitrary * users to call this function with appropriate data. */int32 cont_ad_frame_pow (int16 *buf, int32 *prev, int32 spf){ double sumsq, v; int32 i; int32 p; sumsq = 0.0; p = *prev; for (i = 0; i < spf; i++) { v = (double) (buf[i] - p); sumsq += v*v; p = buf[i]; } *prev = p; if (sumsq < spf) /* Make sure FRMPOW(sumsq) >= 0 */ sumsq = spf; /* * PWP: Units changed to dB * * Now the units of measurement of an input sample are volts (really!), * so the power in dB is p = 20*log10(samp). Further, we want the RMS * (root-mean-squared) average power across the frame. * * "sumsq" is the sum of the sum of the squares, so we want * * i = 20 * log10( sqrt ( sumsq / n_samps) ) * * (Stephen Reed's code actually had * i = 20 * log10( sqrt (sumsq) / n_samps ) * but this only produced an additive error.) * * i = 20 * log10( sqrt ( sumsq / n_samps) ) * = 20 * log10( ( sumsq / n_samps) ^ 0.5 ) * = 20 * log10( ( sumsq / n_samps) ) * 0.5 ) * = 10 * log10( ( sumsq / n_samps) ) * = 10 * ( log10( sumsq) - log10(n_samps) ) */ i = (int32) ((10.0 * (log10(sumsq) - log10((double) spf))) + 0.5); if (i < 0) i = 0; /* trim lower bound again to be safe. */ assert (i < 97); return (i);}/* * Classify frame (id=frm, starting at sample position s) as sil/nonsil. Classification * done in isolation, independent of any other frame, based only on power histogram. */static void compute_frame_pow (cont_ad_t *r, int32 frm){ int32 i; i = cont_ad_frame_pow (r->adbuf + (frm * r->spf), &(r->prev_sample), r->spf); if (logfp) { fprintf (logfp, "%8.2f %2d\n", (double)(frmno * r->spf)/(double)(r->sps), i); fflush (logfp); frmno++; } r->frm_pow[frm] = (char) i; (r->pow_hist[i])++; r->thresh_update--;}void cont_ad_set_logfp (FILE *fp){ logfp = fp;}/* PWP: $$$ check this *//* * PWP: in SReed's code, decay was done by zeroing the histogram, * i.e. no history. */static void decay_hist (cont_ad_t *r){ int32 i; for (i = 0; i < CONT_AD_POWHISTSIZE; i++) r->pow_hist[i] >>= 1;}/* * Find silence threshold from power histogram. */static int32 find_thresh (cont_ad_t *r){ int32 i, j, max, th; if (!r->auto_thresh) return 0; /* * Find smallest non-zero histogram entry, but starting at some minimum power. * Power lower than CONT_AD_MIN_NOISE indicates bad A/D input (eg, mic off...). * Too high a minimum power is also bad. */ for (i = r->min_noise; (i < CONT_AD_POWHISTSIZE) && (r->pow_hist[i] == 0); i++); if (i > r->max_noise) /* Bad signal? */ return -1; /* PWP: Hmmmmm.... SReed's code looks over the lower 20 dB */ /* PWP: 1/14/98 Made to work like Stephen Reed's code */ max = 0; for (j = i, th = i; (j < CONT_AD_POWHISTSIZE) && (j < i+20); j++) { /* PWP: was i+6, which was 9 dB */ if (max < r->pow_hist[j]) { max = r->pow_hist[j]; th = j; } } if (logfp) { fprintf (logfp, "\tfrm= %d noiselevel= %d histthresh= %d\n", frmno, r->noise_level, th); } /* "Don't change the threshold too fast" */#if 0 if ( _ABS(r->noise_level - th) >= 10 ) { if (th > r->noise_level) r->noise_level += ((th - r->noise_level) / 2); else r->noise_level -= ((r->noise_level - th) / 2); } else { r->noise_level = th; }#else /* * RKM: The above is odd; if (diff >= 10) += diff/2; else += diff?? * This is discontinuous. Change to always += diff/2. */ r->noise_level = (int32) (th * r->adapt_rate + r->noise_level * (1.0 - r->adapt_rate));#endif /* update thresholds */ r->thresh_sil = r->noise_level + r->delta_sil; r->thresh_speech = r->noise_level + r->delta_speech; if (logfp) cont_ad_powhist_dump (logfp, r); /*
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?