📄 fe_endpoint.cpp
字号:
/* ---- tail of the frame-driven end-point detection (EPD) state machine ----
   The function's signature and the declarations of i, winSize, frameX,
   frameShift, in, out, inStatus and frameKind are above this chunk and are
   not visible here.  States: WAITING -> SPEECH -> PAUSE -> ENDPOINT.
   Frame labels and samples live in circular buffers indexed modulo
   EPD_FRAME_BUF_SIZE / EPD_SPEECH_BUF_SIZE. */
if(m_isSpeechA[i%EPD_FRAME_BUF_SIZE]==EPD_FK_SPEECH) break;
}
/* Let's make the speech recognizer happy by making one-frame dummy speech */
if(m_uttEndX<m_uttBeginX+EPD_DUMMY_FRAMES){
m_uttEndX=m_uttBeginX+EPD_DUMMY_FRAMES;
}
m_localStatus=EPD_STATUS_ENDPOINT;
/* NOTE(review): the assignment below overwrites the EPD_DUMMY_FRAMES padding
   applied just above -- confirm the padding is really meant to be discarded. */
m_uttEndX=my_max(0,my_min(m_localFrameX,(i+1)+m_config.m_endSilenceFrameN));
if(m_speechSegN==0) m_speechSegN++;
/* printf("EPD: Endpoint detected (begin=%d, end=%d)\n",m_uttBeginX,m_uttEndX); */
/* emit one window of buffered speech from the circular sample buffer */
for(i=0;i<winSize;i++) out[i]=m_epdSpeech[((m_uttBeginX+frameX)*frameShift+i)%EPD_SPEECH_BUF_SIZE];
return FE_END_POINT; /* in-speech */
}
else if(m_localStatus==EPD_STATUS_WAITING){
/* printf("\n"); Error("Epd","[ERROR] No speech\n"); */
/* still waiting when input ended: the utterance never contained speech */
return FE_NO_SPEECH;
}
}
if(m_localStatus==EPD_STATUS_WAITING){
/* find begin-point */
/* printf("EPD: Searching for speech...\n"); */
*frameKind=OneFrame(in);
if(FindBeginPoint(m_localFrameX-1)) {
/* Add extra frames before the speech start-point of localFrameX-startFrameN */
m_uttBeginX=my_max(0,(m_localFrameX-m_config.m_startFrameN-m_config.m_startSilenceFrameN+1));
m_uttEndX=m_localFrameX;
m_localStatus=EPD_STATUS_SPEECH;
/* printf("EPD: Speech detected (begin=%d)\n",m_uttBeginX); */
for(i=0;i<winSize;i++) out[i]=m_epdSpeech[((m_uttBeginX+frameX)*frameShift+i)%EPD_SPEECH_BUF_SIZE];
return FE_SPEECH; /* in-speech */
}
else{
return FE_WAITING;
}
}
else if(m_localStatus==EPD_STATUS_SPEECH){
if(inStatus==FE_EOF){
/* Because subsequent frames are all assumed silence, I regard the frame after the last speech as the end-point. */
m_isSpeechA[m_localFrameX%EPD_FRAME_BUF_SIZE]=EPD_FK_SILENCE;
/* scan backwards (up to endFrameN frames) for the most recent speech frame */
for(i=m_localFrameX-3;i>=m_localFrameX-m_config.m_endFrameN+1 && i>=0;i--){
if(m_isSpeechA[i%EPD_FRAME_BUF_SIZE]==EPD_FK_SPEECH) break;
}
m_localStatus=EPD_STATUS_PAUSE;
m_uttEndX=my_max(0,my_min(m_localFrameX,(i+1)+m_config.m_endSilenceFrameN));
/* printf("EPD: Endpoint detected (begin=%d, end=%d)\n",m_uttBeginX,m_uttEndX); */
for(i=0;i<winSize;i++) out[i]=m_epdSpeech[((m_uttBeginX+frameX)*frameShift+i)%EPD_SPEECH_BUF_SIZE];
return FE_SPEECH; /* in-speech */
}
/* prepare to detect end-point */
/* too few frames since speech onset to look for an end-point yet */
if(m_localFrameX<m_uttBeginX+m_config.m_endFrameN-1){
*frameKind=OneFrame(in);
return FE_NULL;
}
*frameKind=OneFrame(in);
if(FindEndPoint(m_localFrameX-1, m_config.m_endFrameN)) {
m_uttEndX=m_localFrameX-m_config.m_endFrameN;
m_localStatus=EPD_STATUS_PAUSE;
/* keep endSilenceFrameN-endFrameN extra trailing-silence frames, if any */
m_uttEndX=m_uttEndX+my_max(0,m_config.m_endSilenceFrameN-m_config.m_endFrameN);
/* printf("EPD: Endpoint detected (begin=%d, end=%d)\n",m_uttBeginX,m_uttEndX); */
for(i=0;i<winSize;i++) out[i]=m_epdSpeech[((m_uttBeginX+frameX)*frameShift+i)%EPD_SPEECH_BUF_SIZE];
return FE_SPEECH; /* in-speech */
}
else{
m_uttEndX=m_localFrameX;
for(i=0;i<winSize;i++) out[i]=m_epdSpeech[((m_uttBeginX+frameX)*frameShift+i)%EPD_SPEECH_BUF_SIZE];
return FE_SPEECH; /* in-speech */
}
}
else if(m_localStatus==EPD_STATUS_PAUSE){
if(inStatus!=FE_EOF){
*frameKind=OneFrame(in);
}
/* speech resumed during the pause: fall back to the SPEECH state */
if(FindBeginPoint(m_localFrameX-1)) {
m_uttEndX=m_localFrameX;
m_localStatus=EPD_STATUS_SPEECH;
/* printf("EPD: Speech detected again (begin=%d)\n",m_localFrameX); */
for(i=0;i<winSize;i++) out[i]=m_epdSpeech[((m_uttBeginX+frameX)*frameShift+i)%EPD_SPEECH_BUF_SIZE];
return FE_SPEECH; /* in-speech */
}
/* pause still shorter than longPauseFrameN: keep draining buffered frames */
else if(m_localFrameX<m_uttBeginX+m_config.m_longPauseFrameN-1){
if(frameX+m_uttBeginX < m_uttEndX){
for(i=0;i<winSize;i++) out[i]=m_epdSpeech[((m_uttBeginX+frameX)*frameShift+i)%EPD_SPEECH_BUF_SIZE];
return FE_SPEECH; /* in-speech */
}
else if(inStatus==FE_EOF){
m_uttEndX=m_uttBeginX+frameX;
m_speechSegN++;
return FE_END_POINT; /* end-point */
}
else{
for(i=0;i<winSize;i++) out[i]=m_epdSpeech[((m_uttBeginX+frameX)*frameShift+i)%EPD_SPEECH_BUF_SIZE];
return FE_PAUSE; /* pause */
}
}
else{
/* pause is long enough: decide whether it is a real end-point */
if(FindEndPoint(m_localFrameX-1, m_config.m_longPauseFrameN)) {
m_localStatus=EPD_STATUS_ENDPOINT;
if(m_uttEndX < m_uttBeginX+frameX){
m_speechSegN++;
/* printf("EPD: Endpoint detected (begin=%d, end=%d)\n",m_uttBeginX,m_uttEndX); */
return FE_END_POINT; /* end-point */
}
else{
m_uttEndX=m_uttBeginX+frameX;
for(i=0;i<winSize;i++) out[i]=m_epdSpeech[((m_uttBeginX+frameX)*frameShift+i)%EPD_SPEECH_BUF_SIZE];
return FE_SPEECH; /* in-speech */
}
}
else{
for(i=0;i<winSize;i++) out[i]=m_epdSpeech[((m_uttBeginX+frameX)*frameShift+i)%EPD_SPEECH_BUF_SIZE];
return FE_PAUSE; /* pause */
}
}
}
else if(m_localStatus==EPD_STATUS_ENDPOINT){
/* drain any frames still buffered before the detected end-point */
if(m_uttBeginX+frameX < m_uttEndX){
for(i=0;i<winSize;i++) out[i]=m_epdSpeech[((m_uttBeginX+frameX)*frameShift+i)%EPD_SPEECH_BUF_SIZE];
return FE_SPEECH; /* in-speech */
}
else{
if(EPD_MULTIPLE_END_POINT){
/* multi-utterance mode: reset indices and wait for the next segment */
m_uttBeginX=m_localFrameX;
m_uttEndX=m_localFrameX;
m_localStatus=EPD_STATUS_WAITING;
return FE_WAITING; /* wait for another speech */
}
else{
m_speechSegN++;
return FE_END_POINT; /* end-point */
}
}
}
else{
assert(0); /* unreachable: every status value is handled above */
return FE_UNK_ERROR; /* unknown error */
}
return FE_UNK_ERROR; /* unknown error */
}
/*
 * Decide whether a speech begin-point can be declared at frame endX.
 * Returns 1 when (a) the current frame is labelled speech, (b) at most two
 * of the last startFrameN frames are non-speech, and (c) a majority of the
 * window shows a high zero-crossing rate (rejects low-pitched breath noise).
 * Returns 0 otherwise (including when too little history is available).
 */
int Epd::FindBeginPoint(int endX) {
    assert(m_isSpeechA[endX%EPD_FRAME_BUF_SIZE] != -1);
    assert(m_localStatus==EPD_STATUS_WAITING || m_localStatus==EPD_STATUS_PAUSE);

    const int winN = m_config.m_startFrameN;
    if(endX < winN) return 0; /* not enough frames buffered yet */

    /* The most recent frame itself must be speech. */
    if(m_isSpeechA[endX%EPD_FRAME_BUF_SIZE] != EPD_FK_SPEECH) return 0;

    /* Count speech-labelled frames over the look-back window. */
    int speechCnt = 0;
    for(int f = endX - winN + 1; f <= endX; f++)
        speechCnt += (int)m_isSpeechA[f%EPD_FRAME_BUF_SIZE];
    if(speechCnt < winN - 2) return 0; /* tolerate at most two gaps */

    /* Remove breath noise because it has low pitch frequency: require a
       majority of frames whose zero-crossing rate is high. */
    int highZcrCnt = 0;
    for(int f = endX - winN + 1; f <= endX; f++)
        if(m_zcrA[f%EPD_FRAME_BUF_SIZE] >= EPD_HIGH_SNR_ZCR_TH) highZcrCnt++;
    return (highZcrCnt > winN/2) ? 1 : 0;
}
/*
 * Decide whether an utterance end-point can be declared at frame endX.
 * Returns 1 when the current frame is silence and at most one of the last
 * reqSilN frames was labelled speech; returns 0 otherwise.
 */
int Epd::FindEndPoint(int endX, int reqSilN) {
    assert(m_isSpeechA[endX%EPD_FRAME_BUF_SIZE] != -1);
    assert(m_localStatus==EPD_STATUS_SPEECH || m_localStatus==EPD_STATUS_PAUSE);

    /* The most recent frame must itself be silence. */
    if(m_isSpeechA[endX%EPD_FRAME_BUF_SIZE] != EPD_FK_SILENCE) return 0;

    /* Count speech-labelled frames over the reqSilN-frame look-back window. */
    int speechCnt = 0;
    for(int f = endX - reqSilN + 1; f <= endX; f++)
        speechCnt += (int)m_isSpeechA[f%EPD_FRAME_BUF_SIZE];
    return (speechCnt <= 1) ? 1 : 0;
}
/*
 * Classify one frame of input samples s[0..m_config.m_winSize-1] as speech
 * or silence (frame-level VAD), updating the adaptive noise/signal energy
 * and zero-crossing statistics, and record the result in the circular
 * per-frame buffers.  Returns the frame label (EPD_FK_SPEECH/EPD_FK_SILENCE).
 */
EpdFrameKind Epd::OneFrame(const float *s)
{
    int i, N=m_config.m_winSize;
    float frameEn, frameEnTh, sum, zcr, prev, snrEn, zcrTh;
    float x[EPD_MAX_WIN_SIZE];

    /* Smoothing factors: track fast (1-1/(t+1)) during the first frames,
       then settle to the configured long-term constants. */
    if(m_localFrameX<EPD_NB_FRAME_THRESHOLD_LTE)
        m_lambdaLTE=1-1/(float)(m_localFrameX+1);
    else
        m_lambdaLTE=(float)EPD_LAMBDA_LTE;
    if(m_localFrameX<EPD_NB_FRAME_THRESHOLD_LTE)
        m_lambdaZcr=1-1/(float)(m_localFrameX+1);
    else
        m_lambdaZcr=(float)EPD_LAMBDA_ZCR;

    /*
      owkwon: Prevent DC level from drifting and high frquency babble noise from
      surviving noise reduction. Here a band-pass filter with pass band 500-2800 Hz
      is used by cascading H1(z)=1-z^(-1) and H2(z)=z+1-z^(-1). This filter may
      sometimes lose unvoiced frames like /s/ and /ch/. The output of the Mel FB
      in the noise reduction module can be used for this purpose as the Aurora-3
      uses the 2nd, 3rd, 4th FB coefficients. A more elaborate method is needed here.
    */
    {
        /* BUGFIX: the DC-removal stage previously ran BEFORE x[] was ever
           initialized -- it read indeterminate values (undefined behavior) and
           its in-place output was then completely overwritten by the low-pass
           stage, so the high-pass half of the band-pass cascade was dropped.
           Correct order: low-pass s into x first, then DC-block x in place. */
        float x0=0, y0=0, a=(1-1/(float)1024);

        /* low-pass filtering, H(z)=[1 2 1]/4 (edges reuse the boundary sample) */
        x[0]=(s[1]+2*s[0]+s[0])/4; x[N-1]=(s[N-1]+2*s[N-1]+s[N-2])/4;
        for(i=1;i<N-1; i++) {
            x[i]=(s[i+1]+2*s[i]+s[i-1])/4;
        }
        /* DC offset removal, H(z)=(1-z^(-1))/(1-0.999*z^(-1)),
           y[i]=x[i]-x[i-1]+(1-1/1024)*y[i-1] */
        for (i=0; i<N; i++){
            y0 = (float)(x[i] - x0 + a * y0); x0 = x[i]; x[i] = y0; /* in-place output */
        }
    }

    /* log-domain frame energy: 0.5 + 10/ln(10) * ln(1 + mean square) */
    for(i=0, sum=0; i<N; i++) {
        sum += x[i]*x[i];
    }
    frameEn = (float)(0.5+10/log(10)*log(1+sum/N));

    /*
      owkwon: Added the condition (frameEn < EPD_SPEECH_ENERGY_FLOOR) for babble noise.
      The babble noise is not removed completely by noise reduction; there exist
      residual speech-like signals.
    */
    /* Update the long-term noise-energy estimate on low-SNR frames. */
    if((frameEn-m_noiseEn)<EPD_SNR_THRESHOLD_UPD_LTE || m_localFrameX<EPD_MIN_FRAME || (frameEn < EPD_SPEECH_ENERGY_FLOOR)){
        if((frameEn<m_noiseEn) || (m_localFrameX<EPD_MIN_FRAME) || (frameEn < EPD_SPEECH_ENERGY_FLOOR)){
            m_noiseEn=m_noiseEn+(1-m_lambdaLTE)*(frameEn-m_noiseEn);
        }
        else{
            /* frame energy above the current estimate: adapt more cautiously */
            m_noiseEn=m_noiseEn+(1-m_lambdaLTEhigherE)*(frameEn-m_noiseEn);
        }
        if(m_noiseEn<EPD_NOISE_ENERGY_FLOOR) m_noiseEn=(float)EPD_NOISE_ENERGY_FLOOR;
        /* invert the log-energy formula to get a linear amplitude (~2x RMS) */
        m_noiseLevel=(float)(2*sqrt(exp(log(10)/10*(m_noiseEn-0.5))-1));
    }

    /* Update the signal-energy estimate on high-SNR frames. */
    if((frameEn-m_noiseEn)>EPD_SNR_THRESHOLD_UPD_SIGNAL_EN){
        if(m_localFrameX>=EPD_MIN_FRAME){
            m_signalEn=m_signalEn+(1-m_lambdaSignalE)*(frameEn-m_signalEn);
        }
        else{
            m_signalEn=m_noiseEn+EPD_SNR_THRESHOLD_UPD_SIGNAL_EN;
        }
        if(m_signalEn-m_noiseEn < m_lastSnr/2)
            m_signalEn = m_noiseEn + m_lastSnr/2;
    }
    else if(frameEn>m_signalEn){
        m_signalEn=frameEn+EPD_SNR_THRESHOLD_UPD_SIGNAL_EN;
        if(m_signalEn-m_noiseEn < m_lastSnr/2)
            m_signalEn = m_noiseEn + m_lastSnr/2;
    }

    /* zero-crossing rate of the filtered signal about the noise level */
    prev = x[0]-m_noiseLevel;
    for(i=1, zcr=0; i<N; i++) {
        float val = x[i]-m_noiseLevel;
        float ztmp=val*prev;
        if(ztmp<0) zcr++;
        prev=val;
    }
    if((zcr-m_meanZcr) < EPD_ZCR_THRESHOLD_UPD_LTE || m_localFrameX<EPD_MIN_FRAME){
        m_meanZcr=m_meanZcr+(1-m_lambdaZcr)*(zcr-m_meanZcr);
    }
    snrEn=m_signalEn-m_noiseEn;

    /* SNR-dependent decision thresholds, interpolated linearly between the
       low-SNR and high-SNR operating points and clamped to that range. */
    {
        float slopeEn=(EPD_HIGH_SNR_ENERGY_TH-EPD_LOW_SNR_ENERGY_TH)/(float)(EPD_HIGH_SNR-EPD_LOW_SNR);
        float slopeZcr=(EPD_HIGH_SNR_ZCR_TH-EPD_LOW_SNR_ZCR_TH)/(float)(EPD_HIGH_SNR-EPD_LOW_SNR);
        frameEnTh=(float)(EPD_LOW_SNR_ENERGY_TH+slopeEn*(snrEn-EPD_LOW_SNR));
        if(m_localStatus==EPD_STATUS_SPEECH){
            /* hysteresis: while already in speech, lower the energy threshold */
            frameEnTh = frameEnTh-EPD_SPEECH_END_ENERGY_OFFSET;
        }
        frameEnTh=my_max(EPD_LOW_SNR_ENERGY_TH,my_min(frameEnTh,EPD_HIGH_SNR_ENERGY_TH));
        zcrTh=(float)(EPD_LOW_SNR_ZCR_TH+slopeZcr*(snrEn-EPD_LOW_SNR));
        zcrTh=my_max(EPD_HIGH_SNR_ZCR_TH,my_min(zcrTh,EPD_LOW_SNR_ZCR_TH));
    }
#if 0
    if(m_localFrameX%10==0){
        printf("%f\n",snrEn);
    }
#endif

    /* Frame classification. */
    if(frameEn < EPD_SPEECH_ENERGY_FLOOR){
        m_flagVAD=EPD_FK_SILENCE;      /* absolute energy floor (babble guard) */
    }
    else if(frameEn-m_noiseEn < EPD_LOW_SNR){
        m_flagVAD=EPD_FK_SILENCE;      /* SNR too low to be speech */
    }
    else if((frameEn-m_noiseEn)>frameEnTh){
        m_flagVAD=EPD_FK_SPEECH;       /* confident, energy-based speech */
        m_nbSpeechFrame=m_nbSpeechFrame+1;
        m_lastSnr=snrEn;
    }
    else{
        /* borderline energy: accept as speech only in low-noise conditions
           with a clearly elevated zero-crossing rate (unvoiced sounds) */
        if(m_localFrameX>EPD_MIN_FRAME && m_noiseEn < EPD_NOISE_CLEAN && zcr-m_meanZcr >= zcrTh){
            m_flagVAD=EPD_FK_SPEECH;
            m_nbSpeechFrame=m_nbSpeechFrame+1;
        }
        else
        {
            m_flagVAD=EPD_FK_SILENCE;
        }
    }

    /* record this frame's features in the circular buffers and advance time */
    m_zcrA[m_localFrameX%EPD_FRAME_BUF_SIZE]=zcr;
    m_isSpeechA[m_localFrameX%EPD_FRAME_BUF_SIZE]=m_flagVAD;
    m_localFrameX++;
    m_absTimeX++;
    return m_flagVAD;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -