endpoint.cpp

来自「一个语音信号端点检测的程序」· C++ 代码 · 共 462 行
CPP
462 行
//// ENDPOINT.CC - The endpoint class member routines.//// Bruce T. Lowerre, Public domain, 1995, 1997//// $Log: endpoint.cc,v $// Revision 1.3  1997/07/31 18:32:13  lowerre// fixed bug with EP_INUTT//// Revision 1.2  1997/05/23 20:01:32  lowerre// renamed endpoint to endpointer, conflicts with <rpcsvc/nis.h>//// Revision 1.1  1997/05/14 20:34:38  lowerre// Initial revision///////* * The endpointer is used to determine the start and end of a live input signal. * Unlike a pre-recorded utterance, a live input signal is open-ended in that the * actual start and end of the signal is totally unknown.  The search will usually * do a fairly good job of guessing the start of the signal.  However, the actual * end of the signal is unknown to the recognizer.  Reaching the end state in the * recognizer does not necessarily mean the end of signal.  Therefore, the end of * signal must be calculated by some means.  This is the job of the end point * detector.  This module is accessed via a class structure.  It should be called * for each frame of data to determine what processing should be done. * * The endpointer uses "cheap" signal processing features (energy and zero cross count) *  and is intended to run constantly on a host processor without the need of a DSP * or high speed processor.  When the start of the utterance is detected, then the * expensive search can be called. * * The endpointer is designed to run with a real-time processing search.  That means * that the live input signal is processed in real-time while it's being read. * Therefore, the start of signal will occur (and the search will start) before the * entire utterance has been read.  The ramifications of this is that the endpointer * has to guess as to the possible start and end of utterance.  These guesses, frame * labels, are used by other modules to guide the utterance capture and search.  The * endpointer may realize that it has mis-labeled either the start of utterance or the * end of utterance.  When this happens, a special frame label (either EP_RESET * if a false start was detected or EP_NOTEND if a false end was detected) is returned. * * The algorithms used in this module have evolved from 20 years of work with live input * signals. */#include "endpoint.h"/* * ENDPOINTER::ENDPOINTER - class constructor, set initial values */endpointer::endpointer(    long	d_samprate,			// sampling rate in Hz    long	d_windowsize,			// windowsize in samples    long	d_stepsize,			// step size in samples    long	d_maxipause,			// default ending silence in msec    long	d_minuttlng,			// default minuttlng in msec    long	d_zcthresh,			// default zcthresh, Hz    float	d_begfact,			// default begfact    float	d_endfact,			// default endfact    float	d_energyfact,			// default energyfact    float	d_minstartsilence,		// default minstartsilence    float	d_triggerfact,			// default triggerfact    long	d_numdpnoise,			// default numdpnoise    long	d_minfriclng,			// default minfriclng in msec    long	d_maxpause,			// default maxpause in msec    long	d_startblip,			// default startblip in msec    long	d_endblip,			// default endblip in msec    long	d_minvoicelng,			// default minvoicelng in msec    long	d_minrise			// default minrise in msec){    long	i;    samprate = d_samprate;    windowsize = d_windowsize;    stepsize = d_stepsize;    maxipause = (d_maxipause * samprate) / (1000 * stepsize); // number of steps    minuttlng = (d_minuttlng * samprate) / (1000 * stepsize); // number of steps    zcthresh = (d_zcthresh * stepsize) / samprate; // per frame    begfact = d_begfact;    endfact = d_endfact;    energyfact = d_energyfact;    minstartsilence = d_minstartsilence;    numdpnoise = d_numdpnoise;    triggerfact = d_triggerfact;    minfriclng = (d_minfriclng * samprate) / (1000 * stepsize); // number of steps    maxpause = (d_maxpause * samprate) / (1000 * stepsize); // number of steps    startblip = (d_startblip * samprate) / (1000 * stepsize); // number of steps    endblip = (d_endblip * samprate) / (1000 * stepsize); // number of steps    minvoicelng = (d_minvoicelng * samprate) / (1000 * stepsize); // number of steps    minrise = (d_minrise * samprate) / (1000 * stepsize); // number of steps    lastdpnoise = new float[numdpnoise];    for (i = 0; i < numdpnoise; i++)        lastdpnoise[i] = 0.0;    initendpoint ();} // end endpointer::endpointer/* * ENDPOINTER::~ENDPOINTER - class destructor */endpointer::~endpointer (){    delete []lastdpnoise;} // end endpointer::~endpointer/* * ENDPOINT::INITENDPOINT - initialize the endpoint variables */void endpointer::initendpoint (){    long	i;    epstate = NOSILENCE;    noise = 0.0;    ave = 0.0;    begthresh = 0.0;    endthresh = begthresh;    energy = 0.0;    maxpeak = 0.0;    scnt = 0;    vcnt = 0;    evcnt = 0;    voicecount = 0;    zccnt = 0;    bscnt = 0;    startframe = 0;    endframe = 0;    avescnt = 0;    startsilenceok = False;    ncount = 0;    low = True;    for (i = 0; i < numdpnoise; i++)        lastdpnoise[i] = 0.0;} // end endpointer::initendpointvoid endpointer::setnoise (){    dpnoise = lastdpnoise[1] = lastdpnoise[0];    ncount = 2;} // end endpointer::setnoise/* * ENDPOINT::AVERAGENOISE - get average background noise level and shift noise array */void endpointer::averagenoise (){    long	i;    for (dpnoise = 0.0, i = ncount - 1; i > 0; i--)    {        dpnoise += lastdpnoise[i];        lastdpnoise[i] = lastdpnoise[i - 1];    }    dpnoise = (dpnoise + lastdpnoise[0]) / ncount;    if (ncount < numdpnoise)        ncount ++;} // end endpointer::averagenoise/* * ENDPOINT::ZCPEAKPICK - get the zero cross count and average energy */void endpointer::zcpeakpick(    short	*samples			// raw samples){    long	i;    float	sum,		trigger;    short	*smp;    for (sum = 0.0, i = 0, smp = samples; i < windowsize; i++, smp++)        sum += *smp * *smp;    peakreturn = (sqrt (sum / windowsize));    lastdpnoise[0] = peakreturn;    if (ncount == 0)        dpnoise = peakreturn;			// initial value    trigger = dpnoise * triggerfact;		// schmidt trigger band    for (i = 0, zc = 0, smp = samples; i < windowsize; i++, smp++)    {        if (low)        {            if (*smp > trigger)            {					// up cross                zc++;                low = False;			// search for down cross            }        }        else        {            if (*smp < -trigger)            {					// down cross                zc++;                low = True;			// search for up cross            }        }    }} // end endpointer::zcpeakpick/* * ENDPOINT::GETENDPOINT - get the endpoint tag for the raw samples * The recognition system is designed to operate in real-time.  That is, the * search proceeds in parallel with input of the signal.  The endpoint detection * must, therefore, make a guess as to what the current sample is and correct * errors that may have been made previously. */EPTAG endpointer::getendpoint(    short	*samples			// raw samples){    float	tmp;    zcpeakpick (samples);			// get zc count and peak energy    if (peakreturn > maxpeak)    {        maxpeak = peakreturn;        if ((tmp = maxpeak / endfact) > endthresh)            endthresh = tmp;    }    switch (epstate)    {        case NOSILENCE:				// start, get background silence            ave += peakreturn;            if (++scnt <= 3)            {					// average 3 frame's worth                if (scnt == 1)                    setnoise ();                else                    averagenoise ();                if (dpnoise < minstartsilence)                {                    startsilenceok = True;                    ave += peakreturn;                    avescnt++;                }                return (EP_SILENCE);            }            if (!startsilenceok)            {                epstate = START;                return (EP_NOSTARTSILENCE);            }            ave /= avescnt;            noise = ave;            begthresh = noise + begfact;            endthresh = begthresh;            mnbe = noise * energyfact;            epstate = INSILENCE;            return (EP_SILENCE);        case INSILENCE:            ave = ((3.0 * ave) + peakreturn) / 4.0;            if (peakreturn > begthresh || zc > zcthresh)            {					// looks like start of signal                energy += peakreturn - noise;                if (zc > zcthresh)                    zccnt++;                if (peakreturn > begthresh)                    voicecount++;                if (++vcnt > minrise)                {                    scnt = 0;                    epstate = START;		// definitely start of signal                }                return (EP_SIGNAL);            }            else            {					// still in silence                energy = 0.0;                if (ave < noise)                {                    noise = ave;                    begthresh = noise + begfact;                    endthresh = begthresh;                    mnbe = noise * energyfact;                }                if (vcnt > 0)                {				// previous frame was signal                    if (++bscnt > startblip || zccnt == vcnt)                    {				// Oops, no longer in the signal                        noise = ave;                        begthresh = noise * begfact;                        endthresh = begthresh;                        mnbe = noise * energyfact;                        vcnt = 0;                        zccnt = 0;                        bscnt = 0;                        voicecount = 0;                        startframe = 0;                        return (EP_RESET);	// not in the signal, ignore previous                    }                    return (EP_SIGNAL);                }                zccnt = 0;                return (EP_SILENCE);            }         case START:             if (peakreturn > begthresh || zc > zcthresh)             {					// possible start of signal                 energy += peakreturn - noise;                 if (zc > zcthresh)                     zccnt++;                 if (peakreturn > begthresh)                     voicecount++;                 vcnt += scnt + 1;                 scnt = 0;                 if (energy > mnbe || zccnt > minfriclng)                 {                     epstate = INSIGNAL;                     return (EP_INUTT);                 }                 else                     return (EP_SIGNAL);             }             else             if (++scnt > maxpause)             {					// signal went low again, false start                 vcnt = zccnt = voicecount = 0;                 energy = 0.0;                 epstate = INSILENCE;                 ave = ((3.0 * ave) + peakreturn) / 4.0;                 if (ave < noise + begfact)                 {				// lower noise level                     noise = ave;                     begthresh = noise + begfact;                     endthresh = begthresh;                     mnbe = noise * energyfact;                 }                 return (EP_RESET);             }             else                 return (EP_SIGNAL);        case INSIGNAL:            if (peakreturn > endthresh || zc > zcthresh)            {					// still in signal                if (peakreturn > endthresh)                    voicecount++;                vcnt++;                scnt = 0;                return (EP_SIGNAL);            }            else            {					// below end threshold, may be end                scnt++;                epstate = END;                return (EP_MAYBEEND);            }        case END:            if (peakreturn > endthresh || zc > zcthresh)            {					// signal went up again, may not be end                if (peakreturn > endthresh)                    voicecount++;                if (++evcnt > endblip)                {				// back in signal again                    vcnt += scnt + 1;                    evcnt = 0;                    scnt = 0;                    epstate = INSIGNAL;                    return (EP_NOTEND);                }                else                    return (EP_SIGNAL);            }            else            if (++scnt > maxipause)            {					// silence exceeds inter-word pause                if (vcnt > minuttlng && voicecount > minvoicelng)                    return (EP_ENDOFUTT);	// end of utterance                else                {				// signal is too short                    scnt = vcnt = voicecount = 0;                    epstate = INSILENCE;                    return (EP_RESET);		// false utterance, keep looking                }            }            else            {					// may be an inter-word pause                if (peakreturn == 0)                    return (EP_ENDOFUTT);	// zero filler frame                evcnt = 0;                return (EP_SIGNAL);		// assume still in signal            }    }} // end endpointer::getendpoint/* * ENDPOINT::PRINTVARS: Print variable values */void endpointer::printvars (){    cout << "endpoint variables:" << endl;    cout << "    begfact         " << begfact << endl;    cout << "    endblip         " << endblip << endl;    cout << "    endfact         " << endfact << endl;    cout << "    energyfact      " << energyfact << endl;    cout << "    maxipause       " << maxipause << endl;    cout << "    maxpause        " << maxpause << endl;    cout << "    minfriclng      " << minfriclng << endl;    cout << "    minrise         " << minrise << endl;    cout << "    minstartsilence " << minstartsilence << endl;    cout << "    minuttlng       " << minuttlng << endl;    cout << "    minvoicelng     " << minvoicelng << endl;    cout << "    numdpnoise      " << numdpnoise << endl;    cout << "    samprate        " << samprate << endl;    cout << "    startblip       " << startblip << endl;    cout << "    stepsize        " << stepsize << endl;    cout << "    triggerfact     " << triggerfact << endl;    cout << "    windowsize      " << windowsize << endl;    cout << "    zcthresh        " << zcthresh << endl;} // end endpointer::printvars/* * ENDPOINT::GETTAGNAME - convert the tag to ascii */const char *endpointer::gettagname(    EPTAG	tag){    static const char *tagnames[] =		// must match EPTAG enum in endpoint.h		{			"NONE",			"RESET",			"SILENCE",			"SIGNAL",			"INUTT",			"MAYBEEND",			"ENDOFUTT",			"NOTEND",			"NOSTARTSILENCE"		};    long	ntag = long (tag);    if (ntag < 0 || ntag > long (EP_NOSTARTSILENCE))        return ("UNKNOWN");    else        return (tagnames[ntag]);} // end endpointer::gettagname
endpoint.cpp - 源码说明

本页面展示了「一个语音信号端点检测的程序」中的 endpoint.cpp 源码文件，采用 C++ 编程语言编写，共 462 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与语音信号相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?