⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 fe_endpoint.cpp

📁 这是一个语音特征提取的程序源码
💻 CPP
📖 第 1 页 / 共 2 页
字号:
///////////////////////////////////////////////////////////////////////////////
// This is a part of the Feature program.
// Version: 1.0
// Date: February 22, 2003
// Programmer: Oh-Wook Kwon
// Copyright(c) 2003 Oh-Wook Kwon. All rights reserved. owkwon@ucsd.edu
///////////////////////////////////////////////////////////////////////////////

#include "StdAfx.h"
#include "FE_feature.h"

#include <limits.h>
#include <math.h>
#include <assert.h>


/*-----------------------------------------------
* Definition of End-point Detection parameters 
*----------------------------------------------*/
#ifndef DEFAULT_SAMPLING_RATE
#define DEFAULT_SAMPLING_RATE       16000
#endif
#define DEFAULT_LONG_PAUSE_IN_MS      100 /* default duration of long-pause in millisecond (=100 ms) (changed for kWaves) */

/* For provision when threre is no speech detected */
#define EPD_OUTPUT_ALWAYS		        1 /* write speech even if no speech is detected in order to satisfy the speech recognizer. */
#define EPD_DUMMY_FRAMES                1 /* number of frames to be added when no speech is detected. */
#define EPD_MULTIPLE_END_POINT          1 /* detect multiple speech segments (changed for kWaves) */

/* Use terminology in the Aurora-3 VADNest */
#define EPD_NB_FRAME_THRESHOLD_LTE     10
#define EPD_LAMBDA_LTE               0.97
#define EPD_SNR_THRESHOLD_UPD_LTE       4 /* in dB scale <-- 20 */
#define EPD_MIN_FRAME                  10

/* The minimum power of noise and speech are assumed as 14.5 and 37.5,
which roughly correspond to the amplitude levels of 5 and 71. */
#define EPD_NOISE_ENERGY_FLOOR       14.5 /* ~ 0.5+10*log10(5*5) */
#define EPD_SPEECH_ENERGY_FLOOR      37.5 /* ~ 0.5+10*log10(71*71) */
#define EPD_NOISE_CLEAN              30.5 /* */

/* Input frame is decided as speech if the difference between the log frame energy
   and the log mean energy exceeds the following threshold. The thresholds 40 and 48
   denote that the log frame energy of speech is 6 and 8 times larger than the log 
   mean energy, respectively. Note that the threshold is 15 in the Aurora-3 VADNest. */

/* high noise case: internal microphone */
/* The threshold should be optimized later for the target environment. */
#define EPD_LOW_SNR                    10 /* */
#define EPD_LOW_SNR_ENERGY_TH          12 /* ~ 10*log10(4*4) (changed for kWaves) */
#define EPD_LOW_SNR_ZCR_TH             30 /* */

/* low noise case: headset microphone */
/* We increase the threshold in the denosing case because the input signals have been 
   denoised in the preceding noise reduction module and therefore have larger SNR. */
#define EPD_HIGH_SNR                   20 /* (changed for kWaves) */
#define EPD_HIGH_SNR_ENERGY_TH         20 /* ~ 10*log10(10*10). (changed for kWaves) */
#define EPD_HIGH_SNR_ZCR_TH             4 /* */

/* Update signal energy if frame energy is larger than mean by this value in dB scale */
#define EPD_SNR_THRESHOLD_UPD_SIGNAL_EN    10
#define EPD_LAMBDA_SIGNAL_EN             0.95
#define EPD_LAMBDA_ZCR                   0.98
#define EPD_SPEECH_END_ENERGY_OFFSET        6 /* threshold decrease at the speech end */
#define EPD_ZCR_THRESHOLD_UPD_LTE          20 /* threshold to update mean ZCR */


/*---------------------
 * local variables
 *---------------------*/
static int g_longPauseInMs = 0;


bool Fe::EpdMain(const char *inputfile, int sampleRate, const char *outputfile)
{
	/* Run end-point detection on a raw 16-bit PCM file and write the
	   detected boundaries to outputfile as a "#LABEL" text file
	   (one "time -1 label" line per boundary).
	   Returns true on success, false on any I/O or detection failure. */
	FILE *fi = fopen(inputfile, "rb");
	if(fi == NULL)
		return false; /* was 'return 0' in a bool function */

	fseek(fi, 0L, SEEK_END);
	long fsize = ftell(fi);
	rewind(fi);
	if(fsize <= 0 || (size_t)fsize < sizeof(short)){
		/* empty or unreadable file: nothing to detect; previously this
		   indexed &sample[0] on an empty vector (undefined behavior) */
		fclose(fi);
		return false;
	}
	vector<short> sample(fsize/sizeof(short));
	int sampleN = FREAD(&sample[0], sizeof(short), fsize/sizeof(short), fi);
	fclose(fi); /* samples are in memory; the input file is no longer needed */
	if(sampleN <= 0)
		return false;

	vector<CSegment> endPointA;
	if(!epd_basic(&sample[0], sampleN, sampleRate, endPointA)){
		return false;
	}
	FILE *fo = fopen(outputfile, "wb");
	if(!fo){
		fprintf(stderr, "Cannot open %s\n", outputfile);
		return false;
	}
	fprintf(fo, "#LABEL\n");
	/* size_t index avoids the signed/unsigned comparison warning */
	for(size_t i=0; i<endPointA.size(); i++){
		fprintf(fo,"%f -1 %s\n",endPointA[i].m_fSegment, endPointA[i].m_szLabel.c_str());
	}
	fclose(fo);
	return true;
}


int Fe::epd_basic(short *sampleA, int sampleN, int sampleRate, vector<CSegment>& endPointA)
{
	/* Frame-by-frame end-point detection over a raw sample buffer.
	   Feeds overlapping analysis frames to Epd::OneFrame and appends each
	   detected speech segment (a start/end boundary pair, in seconds) to
	   endPointA. Returns the total number of entries in endPointA. */
	int i,t,frameX;
	Epd epd;
	epd.Init(sampleRate,0,1,1); /* isAudio=0, isActive=1, isDenoised=1 */
	epd.InitNewUtterance();
	vector<float> in(epd.m_config.m_winSize);
	vector<float> out(epd.m_config.m_winSize);
	int frameSize=epd.m_config.m_winSize;
	int shiftSize=epd.m_config.m_shiftSize;
	/* number of complete analysis windows that fit in the sample buffer */
	int frameN=(int)((sampleN-(frameSize-shiftSize))/float(shiftSize));
	float period=epd.m_config.m_shiftSize/(float)sampleRate; /* frame shift in seconds */
	FeReturnCode prevStatus=FE_NULL;
	int prevEndPt=-1; /* frame index of the last reported segment end (dedup guard) */
	frameX=0;
	for(t=0;;t++){
		if(t>frameN) break; /* one extra iteration (t==frameN) flushes the detector */
		FeReturnCode inStatus;
		EpdFrameKind frameKind;
		if(t<frameN) {
			inStatus=FE_SPEECH;
			/* copy the t-th overlapping window into the float work buffer */
			for(i=0;i<frameSize;i++) in[i]=sampleA[t*shiftSize+i];
		}
		else{ /* t==frameN */
			inStatus=FE_EOF; /* signal end-of-stream so pending speech is closed out */
		}
		FeReturnCode status=epd.OneFrame(&in[0], &out[0], frameX, frameSize, inStatus, &frameKind);
		if(status==FE_NULL || status==FE_WAITING){
			continue; /* detector has no decision yet; note frameX is NOT advanced */
		}
		/* Emit a segment when speech just ended (end-point or pause) or at EOF. */
		if(((prevStatus==FE_SPEECH) && (status==FE_END_POINT||status==FE_PAUSE)) || t==frameN){
			/* prevEndPt check avoids reporting the same segment twice;
			   begin<end check rejects degenerate segments */
			if(prevEndPt<epd.m_uttBeginX && epd.m_uttBeginX<epd.m_uttEndX){
				epd_insert_endpoint(endPointA,epd.m_uttBeginX*period,epd.m_uttEndX*period);
				prevEndPt=epd.m_uttEndX;
			}
		}
		prevStatus=status;
		frameX++;
	}
	return endPointA.size();
}


bool Fe::epd_insert_endpoint(vector<CSegment>& endPointA, float startPt, float endPt)
{
	/* Append one speech segment as a pair of boundary markers:
	   label "1" at the start time, label "0" at the end time.
	   Always returns true. */
	CSegment begin;
	begin.m_fSegment = startPt;
	begin.m_szLabel = "1";

	CSegment end;
	end.m_fSegment = endPt;
	end.m_szLabel = "0";

	endPointA.push_back(begin);
	endPointA.push_back(end);
	return true;
}


Epd::Epd()
{
	/* Default-construct with the default sampling rate; Init() must be
	   called before processing to configure the detector for a stream. */
	m_absTimeX = 0; /* set once, at construction (ParmInit does not touch it) */
	ParmInit(&m_config, DEFAULT_SAMPLING_RATE, /*isDenoised=*/1);
}


int Epd::ParmInit(EpdParm *epdParm, int samplingRate, int isDenoised)
{
	/* Fill in the frame geometry (window/shift in samples) and the
	   end-point timing parameters (in 10-ms frames) for the given
	   sampling rate. Returns 1 (always succeeds).
	   NOTE(review): isDenoised is currently not used in this function. */
	epdParm->m_sampleRate = samplingRate;

	switch(samplingRate){
	case 8000:
		epdParm->m_shiftSize = 80;  /* 10 ms; must match the feature-extraction frame shift */
		epdParm->m_winSize   = 200; /* 25 ms */
		break;
	case 11000:
	case 11025:
		epdParm->m_shiftSize = 110; /* 10 ms; must match the feature-extraction frame shift */
		epdParm->m_winSize   = 256; /* 23.27 ms */
		break;
	case 16000:
		epdParm->m_shiftSize = 160; /* 10 ms; must match the feature-extraction frame shift */
		epdParm->m_winSize   = 400; /* 25 ms */
		break;
	default:
		{
			/* arbitrary rate: derive 25-ms window / 10-ms shift from the rate */
			float shiftMs = 10, winMs = 25;
			epdParm->m_winSize   = (int)(winMs/1000*epdParm->m_sampleRate);
			epdParm->m_shiftSize = (int)(shiftMs/1000*epdParm->m_sampleRate);
		}
		break;
	}

	epdParm->m_threshFrameN       = EPD_NB_FRAME_THRESHOLD_LTE; /* 100 ms for threshold estimation (<= m_startSilenceFrameN) */
	epdParm->m_startFrameN        = 10; /* 100 ms for speech start detection */
	epdParm->m_startSilenceFrameN =  0; /* silence prepended before speech start (<= m_startFrameN) (changed for kWaves) */
	epdParm->m_endFrameN          = 30; /* 300 ms for pause detection */
	epdParm->m_endSilenceFrameN   =  5; /* 50 ms of silence appended after speech end (changed for kWaves) */

	/* long-pause length: runtime override via SetMaxPause(), else the
	   compile-time default; both are milliseconds converted to 10-ms frames */
	epdParm->m_longPauseFrameN = (g_longPauseInMs > 0 ? g_longPauseInMs : DEFAULT_LONG_PAUSE_IN_MS) / 10;

	return 1;
}


int Epd::Init(int samplingRate, int isAudio, int isActive, int isDenoised)
{
	/* Configure the detector for a new stream: store the mode flags,
	   derive the frame geometry, and reset all running statistics.
	   Returns 1 (always succeeds). */
	m_isAudio  = isAudio;
	m_isActive = isActive;
	ParmInit(&m_config, samplingRate, isDenoised);

	/* per-utterance bookkeeping */
	m_uttBeginX   = 0;
	m_uttEndX     = 0;
	m_localFrameX = 0;
	m_sampleEndX  = 0;
	m_localStatus = EPD_STATUS_WAITING;
	m_absTimeX    = 0; /* initialized only once */

	/* running energy/ZCR statistics; initial values follow the Aurora-3 VADNest */
	m_lambdaLTE        = (float)EPD_LAMBDA_LTE;
	m_lambdaLTEhigherE = (float)0.99;
	m_lambdaSignalE    = (float)EPD_LAMBDA_SIGNAL_EN;
	m_noiseEn       = 0;
	m_signalEn      = 0;
	m_meanZcr       = 0;
	m_nbSpeechFrame = 0;
	m_lastSnr = (EPD_HIGH_SNR + EPD_LOW_SNR) / 2; /* start midway between the low/high SNR presets */
	m_flagVAD = EPD_FK_SILENCE;

	return 1;
}


void Epd::SetMaxPause(int msec)
{
	/* Set the long-pause duration (in milliseconds) used for speech-end
	   detection; picked up by ParmInit() via g_longPauseInMs.
	   Values larger than EPD_MAX_PAUSE_IN_MS are rejected. Previously the
	   out-of-range value was stored anyway in release builds, because
	   assert(0) is a no-op under NDEBUG; the early return below ensures an
	   invalid value is never applied. */
	if(msec > EPD_MAX_PAUSE_IN_MS){
		fprintf(stderr, "[ERROR] Too large pause duration. Must be less than %d\n",EPD_MAX_PAUSE_IN_MS);
		assert(0);
		return; /* keep the previous setting in release builds */
	}
	g_longPauseInMs = msec;
}


Epd::~Epd()
{
	/* Nothing to release: this class acquires no dynamic resources in
	   the code visible here. */
}


int Epd::InitNewUtterance()
{
	/* Reset all per-utterance state before processing a new utterance.
	   Returns 1 (always succeeds). */
	for(int k=0; k<EPD_FRAME_BUF_SIZE; k++){
		m_isSpeechA[k] = (EpdFrameKind)(-1); /* -1 marks "no decision yet" */
		m_zcrA[k] = 0;
	}
	m_localStatus = EPD_STATUS_WAITING;
	m_uttBeginX   = 0;
	m_uttEndX     = 0;
	m_localFrameX = 0;
	m_sampleEndX  = 0;
	m_speechSegN  = 0;
	return 1;
}


int Epd::PutSample(float *sampleA, int sampleN)
{
	/* Copy sampleN samples into the circular speech buffer, truncating
	   each float to short. m_sampleEndX is kept in
	   [0, EPD_SPEECH_BUF_SIZE) by the modular increment.
	   Returns the number of samples consumed. */
	for(int k = 0; k < sampleN; ++k){
		m_epdSpeech[m_sampleEndX % EPD_SPEECH_BUF_SIZE] = (short)sampleA[k];
		m_sampleEndX = (m_sampleEndX + 1) % EPD_SPEECH_BUF_SIZE;
	}
	return sampleN;
}


/* Teardown hook for an Epd instance; currently a deliberate no-op
   (Epd's destructor visible in this file releases nothing either). */
void EpdClose(Epd *epd)
{
}


/* This algorithm is complicated for now and I need more elegant way of endpoint detection. */
FeReturnCode Epd::OneFrame(float *in, float *out, int frameX, int winSize, FeReturnCode inStatus, EpdFrameKind *frameKind)
{
	int i;
	int frameShift=m_config.m_shiftSize;
	int maxDataN=EPD_MAX_RECORD_TIME*m_config.m_sampleRate;

	if(m_localFrameX==0) PutSample(in,winSize);
	else                 PutSample(in+my_max(0,winSize-frameShift),frameShift);

	*frameKind=EPD_FK_SILENCE;
	if(m_isActive==0){
		if(inStatus==FE_EOF){
			m_uttEndX=m_localFrameX;
			return FE_END_POINT; /* end-point detected */
		}
		else{
			for(i=0;i<winSize;i++) out[i]=m_epdSpeech[(m_localFrameX*frameShift+i)%EPD_SPEECH_BUF_SIZE];
			*frameKind=OneFrame(in);		
			m_uttEndX=m_localFrameX;
			return FE_SPEECH; /* in-speech */
		}
	}
	
	if(inStatus==FE_EOF && frameX+m_uttBeginX+m_config.m_startFrameN >= m_uttEndX){
		if(m_isSpeechA[(m_localFrameX-1)%EPD_FRAME_BUF_SIZE]==EPD_FK_SPEECH && m_isSpeechA[(m_localFrameX-2)%EPD_FRAME_BUF_SIZE]==EPD_FK_SPEECH){
			/* printf("\n"); Error("Epd","[ERROR] EOF found in the middle of speech\n"); */
			return FE_EARLY_END;
		}
		else if(m_isAudio==0 && EPD_OUTPUT_ALWAYS && (m_uttBeginX>m_uttEndX || m_speechSegN<1)){
			/* Because subsequent frames are all assumed silence, I regard the frame after the last speech as the end-point. */
			m_isSpeechA[m_localFrameX%EPD_FRAME_BUF_SIZE]=EPD_FK_SILENCE;
			for(i=m_localFrameX-3;i>=m_localFrameX-m_config.m_endFrameN+1 && i>=0;i--){

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -